Source data

Preparation

Description of study data

The study data is stored in 2 separate files: participant-info.csv & ahi-cesd.csv.

File participant-info.csv

Contains demographic information on participants:

  • id: participant id

  • intervention: 3 positive psychology interventions, plus 1 control condition:

    • 1 = “Using Signature Strengths”,
    • 2 = “Three Good Things”,
    • 3 = “Gratitude Visit”,
    • 4 = “Recording early memories” (control condition).
  • sex:

    • 1 = female,
    • 2 = male.
  • age: participant’s age (in years).

  • educ: level of education:

    • 1 = Less than Year 12,
    • 2 = Year 12,
    • 3 = Vocational training,
    • 4 = Bachelor’s degree,
    • 5 = Postgraduate degree.
  • income:

    • 1 = below average,
    • 2 = average,
    • 3 = above average.

File ahi-cesd.csv

Contains data of the 24 items of the Authentic Happiness Inventory (AHI) and answers to the 20 items of the Center for Epidemiological Studies Depression (CES-D) scale for (up to 6) measurement occasions.

  • id: Participant ID

  • occasion: Measurement occasion:

    • 0 = Pretest (i.e., at enrolment),
    • 1 = Posttest (i.e., 7 days after pretest),
    • 2 = 1-week follow-up, (i.e., 14 days after pretest, 7 days after posttest),
    • 3 = 1-month follow-up, (i.e., 38 days after pretest, 31 days after posttest),
    • 4 = 3-month follow-up, (i.e., 98 days after pretest, 91 days after posttest),
    • 5 = 6-month follow-up, (i.e., 189 days after pretest, 182 days after posttest).
  • elapsed.days: time since enrolment measured in fractional days

  • intervention: intervention group

  • ahi01–ahi24: responses on 24 AHI items

  • cesd01–cesd20: responses on 20 CES-D items

  • ahiTotal: total AHI score

  • cesdTotal: total CES-D score

Screening and cleaning data

Import data

## Lisa:
## Read both study files from the local project folder
## (semicolon-delimited copies).

# Measurement data (AHI and CES-D items):
ahi_cesd <- read_delim(
  file = "~/Uni/7. Semester/HiWi/Data/Web-based positive psychology interventions/ahi-cesd.csv",
  delim = ";",
  escape_double = FALSE,
  trim_ws = TRUE
)

# Demographic data of the participants:
participant_info <- read_delim(
  file = "~/Uni/7. Semester/HiWi/Data/Web-based positive psychology interventions/participant-info.csv",
  delim = ";",
  escape_double = FALSE,
  trim_ws = TRUE
)
## Hans: 

# library(data.table)

## Get & set current path:
# cur.path <- dirname(rstudioapi::getActiveDocumentContext()$path)
# cur.path
# setwd(cur.path) # set to current directory

## Folder containing the raw study files (machine-specific absolute path):
my_path <- "/Users/hneth/Desktop/stuff/Dropbox/_code/R/_teachR/ds4psy/data/_posPsy"

## Import the comma-delimited AHI/CES-D measurement data.
## - file.path() assembles the path instead of pasting a "/" by hand.
## - col_types forces elapsed.days to be parsed as a double.
ahi_cesd <- read_delim(
  file = file.path(my_path, "ahi-cesd.csv"),
  delim = ",",
  escape_double = FALSE,
  trim_ws = TRUE,
  col_types = cols(elapsed.days = col_double())
)
ahi_cesd
#> # A tibble: 992 x 50
#>       id occasion elapsed.days intervention ahi01 ahi02 ahi03 ahi04 ahi05
#>    <int>    <int>        <dbl>        <int> <int> <int> <int> <int> <int>
#>  1     1        0         0               4     2     3     2     3     3
#>  2     1        1        11.8             4     3     3     4     3     3
#>  3     2        0         0               1     3     4     3     4     2
#>  4     2        1         8.02            1     3     4     4     4     3
#>  5     2        2        14.3             1     3     4     4     4     3
#>  6     2        3        32.0             1     3     4     4     4     4
#>  7     2        4        92.2             1     3     3     2     3     3
#>  8     2        5       182.              1     3     3     3     4     2
#>  9     3        0         0               4     3     3     2     4     2
#> 10     3        2        16.4             4     3     3     3     4     4
#> # ... with 982 more rows, and 41 more variables: ahi06 <int>, ahi07 <int>,
#> #   ahi08 <int>, ahi09 <int>, ahi10 <int>, ahi11 <int>, ahi12 <int>,
#> #   ahi13 <int>, ahi14 <int>, ahi15 <int>, ahi16 <int>, ahi17 <int>,
#> #   ahi18 <int>, ahi19 <int>, ahi20 <int>, ahi21 <int>, ahi22 <int>,
#> #   ahi23 <int>, ahi24 <int>, cesd01 <int>, cesd02 <int>, cesd03 <int>,
#> #   cesd04 <int>, cesd05 <int>, cesd06 <int>, cesd07 <int>, cesd08 <int>,
#> #   cesd09 <int>, cesd10 <int>, cesd11 <int>, cesd12 <int>, cesd13 <int>,
#> #   cesd14 <int>, cesd15 <int>, cesd16 <int>, cesd17 <int>, cesd18 <int>,
#> #   cesd19 <int>, cesd20 <int>, ahiTotal <int>, cesdTotal <int>
# ahi_cesd$elapsed.days  # is double!

# ahi_cesd_2 <- readr::read_csv(paste0(my_path, "/ahi-cesd.csv"))
# ahi_cesd_2

## Import the comma-delimited participant (demographic) data from my_path.
## file.path() assembles the path instead of pasting a "/" by hand.
participant_info <- read_delim(
  file = file.path(my_path, "participant-info.csv"),
  delim = ",",
  escape_double = FALSE,
  trim_ws = TRUE
)
# participant_info

# participant_info_2 <- readr::read_csv(paste0(my_path, "/participant-info.csv"))
# participant_info_2
                             
## Save data files:
## save(participant_info, file = "./data/p_info.RData")
# write_csv(participant_info, path = "/Users/hneth/Desktop/stuff/Dropbox/_code/R/_teachR/ds4psy/data/posPsy_participants.csv")
# write_csv(ahi_cesd, path = "/Users/hneth/Desktop/stuff/Dropbox/_code/R/_teachR/ds4psy/data/posPsy_AHI_CESD.csv")

## Restore data files (from online sources): ------ 

# Participant data: read the online copy and compare it with the
# locally imported version.
# p_info <- read_csv(file = "/Users/hneth/Desktop/stuff/Dropbox/_code/R/_teachR/ds4psy/data/posPsy_participants.csv")
p_info <- read_csv("http://rpository.com/ds4psy/data/posPsy_participants.csv")  # online
# p_info
all.equal(target = participant_info, current = p_info) # should be TRUE
#> [1] TRUE


# AHI/CES-D data: read the online copy and compare it with the
# locally imported version.
# AHI_CESD <- read_csv(file = "/Users/hneth/Desktop/stuff/Dropbox/_code/R/_teachR/ds4psy/data/posPsy_AHI_CESD.csv")
AHI_CESD <- read_csv("http://rpository.com/ds4psy/data/posPsy_AHI_CESD.csv")  # online
# AHI_CESD

all.equal(target = ahi_cesd, current = AHI_CESD) # should be TRUE
#> [1] TRUE

Data screening

Run some simple checks:

  • How many occasions are there for each participant?
## Data:
# AHI_CESD

# Count the number of rows for every combination of participant (id) and
# measurement occasion. A count n other than 1 means that an id has more
# than one row for the same occasion.
id_occ <- AHI_CESD %>%
  group_by(id, occasion) %>%
  count()
id_occ
#> # A tibble: 990 x 3
#> # Groups:   id, occasion [990]
#>       id occasion     n
#>    <int>    <int> <int>
#>  1     1        0     1
#>  2     1        1     1
#>  3     2        0     1
#>  4     2        1     1
#>  5     2        2     1
#>  6     2        3     1
#>  7     2        4     1
#>  8     2        5     1
#>  9     3        0     1
#> 10     3        2     1
#> # ... with 980 more rows

# Show only the id-occasion combinations whose count differs from 1:
id_occ %>%
  filter(n != 1)
#> # A tibble: 2 x 3
#> # Groups:   id, occasion [2]
#>      id occasion     n
#>   <int>    <int> <int>
#> 1     8        2     2
#> 2    64        4     2
# => Participants with id of 8 and 64: 
#    2 instances of occasion 2 and 4, respectively.

# Spread by occasion:
# Reshape the counts to wide format (one row per id, one column per
# occasion); an NA cell means that the id has no row for that occasion.
# NOTE(review): spread() is superseded by tidyr::pivot_wider(); before
# switching, verify that grouping and printed output stay the same.
id_occ_2 <- id_occ %>%
  spread(key = occasion, value = n)
id_occ_2
#> # A tibble: 295 x 7
#> # Groups:   id [295]
#>       id   `0`   `1`   `2`   `3`   `4`   `5`
#>    <int> <int> <int> <int> <int> <int> <int>
#>  1     1     1     1    NA    NA    NA    NA
#>  2     2     1     1     1     1     1     1
#>  3     3     1    NA     1    NA    NA    NA
#>  4     4     1     1    NA     1     1    NA
#>  5     5     1     1     1     1     1    NA
#>  6     6     1    NA     1     1    NA    NA
#>  7     7     1    NA    NA    NA    NA    NA
#>  8     8     1     1     2     1     1    NA
#>  9     9     1    NA    NA    NA    NA    NA
#> 10    10     1    NA    NA    NA    NA    NA
#> # ... with 285 more rows

# Occasion counts of the 2 participants with repeated occasions:
filter(id_occ_2, id %in% c(8, 64))
#> # A tibble: 2 x 7
#> # Groups:   id [2]
#>      id   `0`   `1`   `2`   `3`   `4`   `5`
#>   <int> <int> <int> <int> <int> <int> <int>
#> 1     8     1     1     2     1     1    NA
#> 2    64     1     1     1    NA     2     1

# Number of measurements (rows) per occasion. The id column is dropped
# first, because the sum of participant ids is meaningless; ungroup() is
# needed because id is the grouping variable and would otherwise be kept
# by select().
id_occ_2 %>%
  ungroup() %>%
  select(-id) %>%
  colSums(na.rm = TRUE)
#>   0   1   2   3   4   5 
#> 295 147 157 139 134 120

# occasion:     0     1     2     3     4     5 
# colSums:    295   147   157   139   134   120 

Note: 2 participants (8 and 64) are counted twice for an occasion (2 and 4, respectively).

Compare with Table 1 (p. 4) of Woodworth et al. (2018), which shows the number of participants who responded on each of the 6 measurement occasions: As the counts in this table correspond to ours, the repeated instances for some measurement occasions (which could indicate data entry errors, but could also be due to large variability in the time interval between measurements) were not reported in the original analysis (i.e., Table 1).

Distribution of occasions

Questions:

  • Which occasions are correct for id == 8 and id == 64?

  • How does the number of elapsed.days correspond to the measurement occasions?

  • How are the measurement times (elapsed.days) distributed overall (and relative to the stated times of each occasion)?

## Data:
# AHI_CESD

# Scheduled time of each measurement occasion (in days after the pretest),
# from Table 1 (p. 4). The names are used as labels in the plot titles.
occ_days <- c(
  "0: pre-test"  = 0,
  "1: post-test" = 7,
  "2: 1-week"    = 14,
  "3: 1-month"   = 38,  # occasion 3 is the 1-month follow-up (38 days)
  "4: 3-months"  = 98,
  "5: 6-months"  = 189
)
occ_days
#>  0: pre-test 1: post-test    2: 1-week   3: 1-month  4: 3-months 
#>            0            7           14           38           98 
#>  5: 6-months 
#>          189

# Histogram of all measurement times (1-day bins); the dashed vertical
# lines mark the scheduled days of the occasions (occ_days).
AHI_CESD %>%
  ggplot(mapping = aes(x = elapsed.days)) +
  geom_histogram(binwidth = 1, fill = "steelblue") +
  geom_vline(xintercept = occ_days, linetype = 2, color = "firebrick") +
  theme_bw() +
  labs(title = "Distribution of occasions")

Note: The first 3 occasions (0 to 2) are as expected. However, the last 3 occasions (3 to 5) appear shifted to the left (i.e., were about 7 days earlier than stated).

  • What is the range of measurement times (elapsed.days) for each occasion?

(This should show outliers and allow correcting the occasions counted twice for the 2 participants with id 8 and 64).

## Data: ----- 
# AHI_CESD
# occ_days

# Occasion 0: 0 days  ----- 

# Histogram of elapsed.days for occasion i; the dashed line marks the
# scheduled day of that occasion.
i <- 0
ggplot(filter(AHI_CESD, occasion == i), aes(x = elapsed.days)) +
  geom_histogram(binwidth = 1, fill = "steelblue") +
  geom_vline(xintercept = occ_days[i + 1], linetype = 2, color = "firebrick") +
  theme_bw() +
  labs(
    title = paste0(
      "Distribution of occasion ", i, " (", names(occ_days)[i + 1], ")"
    )
  )

# => No variation, all 295 participants start with exactly 0, as expected.

# Occasion 1: 7 days ----- 

# Histogram of elapsed.days for occasion i; the dashed line marks the
# scheduled day of that occasion.
i <- 1
ggplot(filter(AHI_CESD, occasion == i), aes(x = elapsed.days)) +
  geom_histogram(binwidth = 1, fill = "steelblue") +
  geom_vline(xintercept = occ_days[i + 1], linetype = 2, color = "firebrick") +
  theme_bw() +
  labs(
    title = paste0(
      "Distribution of occasion ", i, " (", names(occ_days)[i + 1], ")"
    )
  )

# => Substantial range, 2 outliers (1 at 0 = occasion 0, and 1 at 25).

# Check early outliers:
filter(AHI_CESD, occasion == 1 & elapsed.days < 1)
#> # A tibble: 1 x 50
#>      id occasion elapsed.days intervention ahi01 ahi02 ahi03 ahi04 ahi05
#>   <int>    <int>        <dbl>        <int> <int> <int> <int> <int> <int>
#> 1   144        1            0            1     3     3     2     2     2
#> # ... with 41 more variables: ahi06 <int>, ahi07 <int>, ahi08 <int>,
#> #   ahi09 <int>, ahi10 <int>, ahi11 <int>, ahi12 <int>, ahi13 <int>,
#> #   ahi14 <int>, ahi15 <int>, ahi16 <int>, ahi17 <int>, ahi18 <int>,
#> #   ahi19 <int>, ahi20 <int>, ahi21 <int>, ahi22 <int>, ahi23 <int>,
#> #   ahi24 <int>, cesd01 <int>, cesd02 <int>, cesd03 <int>, cesd04 <int>,
#> #   cesd05 <int>, cesd06 <int>, cesd07 <int>, cesd08 <int>, cesd09 <int>,
#> #   cesd10 <int>, cesd11 <int>, cesd12 <int>, cesd13 <int>, cesd14 <int>,
#> #   cesd15 <int>, cesd16 <int>, cesd17 <int>, cesd18 <int>, cesd19 <int>,
#> #   cesd20 <int>, ahiTotal <int>, cesdTotal <int>
# id = 144.

# Check late outliers:
filter(AHI_CESD, occasion == 1 & elapsed.days > 24)
#> # A tibble: 1 x 50
#>      id occasion elapsed.days intervention ahi01 ahi02 ahi03 ahi04 ahi05
#>   <int>    <int>        <dbl>        <int> <int> <int> <int> <int> <int>
#> 1   226        1         24.9            2     3     3     4     3     3
#> # ... with 41 more variables: ahi06 <int>, ahi07 <int>, ahi08 <int>,
#> #   ahi09 <int>, ahi10 <int>, ahi11 <int>, ahi12 <int>, ahi13 <int>,
#> #   ahi14 <int>, ahi15 <int>, ahi16 <int>, ahi17 <int>, ahi18 <int>,
#> #   ahi19 <int>, ahi20 <int>, ahi21 <int>, ahi22 <int>, ahi23 <int>,
#> #   ahi24 <int>, cesd01 <int>, cesd02 <int>, cesd03 <int>, cesd04 <int>,
#> #   cesd05 <int>, cesd06 <int>, cesd07 <int>, cesd08 <int>, cesd09 <int>,
#> #   cesd10 <int>, cesd11 <int>, cesd12 <int>, cesd13 <int>, cesd14 <int>,
#> #   cesd15 <int>, cesd16 <int>, cesd17 <int>, cesd18 <int>, cesd19 <int>,
#> #   cesd20 <int>, ahiTotal <int>, cesdTotal <int>
# id = 226.

# Occasion 2: 14 days  ----- 

# Histogram of elapsed.days for occasion i; the dashed line marks the
# scheduled day of that occasion.
i <- 2
ggplot(filter(AHI_CESD, occasion == i), aes(x = elapsed.days)) +
  geom_histogram(binwidth = 1, fill = "steelblue") +
  geom_vline(xintercept = occ_days[i + 1], linetype = 2, color = "firebrick") +
  theme_bw() +
  labs(
    title = paste0(
      "Distribution of occasion ", i, " (", names(occ_days)[i + 1], ")"
    )
  )

# => Increasing range, several outliers (2 at 7 days = occasion 1, and 2 in range of 38+ days, i.e., occasion 3).

# Check early outliers:
filter(AHI_CESD, occasion == 2 & elapsed.days < 8)
#> # A tibble: 2 x 50
#>      id occasion elapsed.days intervention ahi01 ahi02 ahi03 ahi04 ahi05
#>   <int>    <int>        <dbl>        <int> <int> <int> <int> <int> <int>
#> 1   144        2         7.10            1     3     3     2     2     2
#> 2   275        2         7.18            3     4     4     3     4     4
#> # ... with 41 more variables: ahi06 <int>, ahi07 <int>, ahi08 <int>,
#> #   ahi09 <int>, ahi10 <int>, ahi11 <int>, ahi12 <int>, ahi13 <int>,
#> #   ahi14 <int>, ahi15 <int>, ahi16 <int>, ahi17 <int>, ahi18 <int>,
#> #   ahi19 <int>, ahi20 <int>, ahi21 <int>, ahi22 <int>, ahi23 <int>,
#> #   ahi24 <int>, cesd01 <int>, cesd02 <int>, cesd03 <int>, cesd04 <int>,
#> #   cesd05 <int>, cesd06 <int>, cesd07 <int>, cesd08 <int>, cesd09 <int>,
#> #   cesd10 <int>, cesd11 <int>, cesd12 <int>, cesd13 <int>, cesd14 <int>,
#> #   cesd15 <int>, cesd16 <int>, cesd17 <int>, cesd18 <int>, cesd19 <int>,
#> #   cesd20 <int>, ahiTotal <int>, cesdTotal <int>
# id = 144 (as above), and 275 (new).

# Check late outliers:
filter(AHI_CESD, occasion == 2 & elapsed.days > 38)
#> # A tibble: 2 x 50
#>      id occasion elapsed.days intervention ahi01 ahi02 ahi03 ahi04 ahi05
#>   <int>    <int>        <dbl>        <int> <int> <int> <int> <int> <int>
#> 1     8        2         38.9            2     2     2     1     1     2
#> 2   232        2         48.8            1     3     3     3     4     4
#> # ... with 41 more variables: ahi06 <int>, ahi07 <int>, ahi08 <int>,
#> #   ahi09 <int>, ahi10 <int>, ahi11 <int>, ahi12 <int>, ahi13 <int>,
#> #   ahi14 <int>, ahi15 <int>, ahi16 <int>, ahi17 <int>, ahi18 <int>,
#> #   ahi19 <int>, ahi20 <int>, ahi21 <int>, ahi22 <int>, ahi23 <int>,
#> #   ahi24 <int>, cesd01 <int>, cesd02 <int>, cesd03 <int>, cesd04 <int>,
#> #   cesd05 <int>, cesd06 <int>, cesd07 <int>, cesd08 <int>, cesd09 <int>,
#> #   cesd10 <int>, cesd11 <int>, cesd12 <int>, cesd13 <int>, cesd14 <int>,
#> #   cesd15 <int>, cesd16 <int>, cesd17 <int>, cesd18 <int>, cesd19 <int>,
#> #   cesd20 <int>, ahiTotal <int>, cesdTotal <int>
# id = 8 (noted above), and 232 (new).


# Occasion 3: 38 days -----

# Histogram of elapsed.days for occasion i; the dashed line marks the
# scheduled day of that occasion.
i <- 3
ggplot(filter(AHI_CESD, occasion == i), aes(x = elapsed.days)) +
  geom_histogram(binwidth = 1, fill = "steelblue") +
  geom_vline(xintercept = occ_days[i + 1], linetype = 2, color = "firebrick") +
  theme_bw() +
  labs(
    title = paste0(
      "Distribution of occasion ", i, " (", names(occ_days)[i + 1], ")"
    )
  )

# => Marked shift by about 7 days to left (i.e., earlier than stated), 
#    increasing range, several outliers (but NOT in range of occasions 2 and 4).

# Check early outliers:
filter(AHI_CESD, occasion == 3 & elapsed.days < 30)
#> # A tibble: 3 x 50
#>      id occasion elapsed.days intervention ahi01 ahi02 ahi03 ahi04 ahi05
#>   <int>    <int>        <dbl>        <int> <int> <int> <int> <int> <int>
#> 1    76        3         29.8            1     4     4     2     4     4
#> 2   144        3         24.7            1     3     3     2     3     3
#> 3   275        3         27.9            3     4     4     3     4     4
#> # ... with 41 more variables: ahi06 <int>, ahi07 <int>, ahi08 <int>,
#> #   ahi09 <int>, ahi10 <int>, ahi11 <int>, ahi12 <int>, ahi13 <int>,
#> #   ahi14 <int>, ahi15 <int>, ahi16 <int>, ahi17 <int>, ahi18 <int>,
#> #   ahi19 <int>, ahi20 <int>, ahi21 <int>, ahi22 <int>, ahi23 <int>,
#> #   ahi24 <int>, cesd01 <int>, cesd02 <int>, cesd03 <int>, cesd04 <int>,
#> #   cesd05 <int>, cesd06 <int>, cesd07 <int>, cesd08 <int>, cesd09 <int>,
#> #   cesd10 <int>, cesd11 <int>, cesd12 <int>, cesd13 <int>, cesd14 <int>,
#> #   cesd15 <int>, cesd16 <int>, cesd17 <int>, cesd18 <int>, cesd19 <int>,
#> #   cesd20 <int>, ahiTotal <int>, cesdTotal <int>
# id = 144 (as above), and 76, 275 (new).

# Check late outliers:
filter(AHI_CESD, occasion == 3 & elapsed.days > 50)
#> # A tibble: 3 x 50
#>      id occasion elapsed.days intervention ahi01 ahi02 ahi03 ahi04 ahi05
#>   <int>    <int>        <dbl>        <int> <int> <int> <int> <int> <int>
#> 1     8        3         50.6            2     3     2     1     2     1
#> 2   199        3         55.8            1     3     3     3     4     3
#> 3   232        3         61.8            1     3     3     3     4     3
#> # ... with 41 more variables: ahi06 <int>, ahi07 <int>, ahi08 <int>,
#> #   ahi09 <int>, ahi10 <int>, ahi11 <int>, ahi12 <int>, ahi13 <int>,
#> #   ahi14 <int>, ahi15 <int>, ahi16 <int>, ahi17 <int>, ahi18 <int>,
#> #   ahi19 <int>, ahi20 <int>, ahi21 <int>, ahi22 <int>, ahi23 <int>,
#> #   ahi24 <int>, cesd01 <int>, cesd02 <int>, cesd03 <int>, cesd04 <int>,
#> #   cesd05 <int>, cesd06 <int>, cesd07 <int>, cesd08 <int>, cesd09 <int>,
#> #   cesd10 <int>, cesd11 <int>, cesd12 <int>, cesd13 <int>, cesd14 <int>,
#> #   cesd15 <int>, cesd16 <int>, cesd17 <int>, cesd18 <int>, cesd19 <int>,
#> #   cesd20 <int>, ahiTotal <int>, cesdTotal <int>
# id = 8 and 232 (noted above), and 199 (new).


# Occasion 4: 98 days ----- 

# Histogram of elapsed.days for occasion i; the dashed line marks the
# scheduled day of that occasion.
i <- 4
ggplot(filter(AHI_CESD, occasion == i), aes(x = elapsed.days)) +
  geom_histogram(binwidth = 1, fill = "steelblue") +
  geom_vline(xintercept = occ_days[i + 1], linetype = 2, color = "firebrick") +
  theme_bw() +
  labs(
    title = paste0(
      "Distribution of occasion ", i, " (", names(occ_days)[i + 1], ")"
    )
  )

# => Marked shift by about 7 days to left (i.e., earlier than stated), 
#    increasing range, several outliers (but NOT in range of occasions 3 and 5).

# Check early outliers:
filter(AHI_CESD, occasion == 4 & elapsed.days < 90)
#> # A tibble: 2 x 50
#>      id occasion elapsed.days intervention ahi01 ahi02 ahi03 ahi04 ahi05
#>   <int>    <int>        <dbl>        <int> <int> <int> <int> <int> <int>
#> 1    53        4         89.9            1     3     3     3     3     3
#> 2   144        4         87.7            1     3     3     2     4     3
#> # ... with 41 more variables: ahi06 <int>, ahi07 <int>, ahi08 <int>,
#> #   ahi09 <int>, ahi10 <int>, ahi11 <int>, ahi12 <int>, ahi13 <int>,
#> #   ahi14 <int>, ahi15 <int>, ahi16 <int>, ahi17 <int>, ahi18 <int>,
#> #   ahi19 <int>, ahi20 <int>, ahi21 <int>, ahi22 <int>, ahi23 <int>,
#> #   ahi24 <int>, cesd01 <int>, cesd02 <int>, cesd03 <int>, cesd04 <int>,
#> #   cesd05 <int>, cesd06 <int>, cesd07 <int>, cesd08 <int>, cesd09 <int>,
#> #   cesd10 <int>, cesd11 <int>, cesd12 <int>, cesd13 <int>, cesd14 <int>,
#> #   cesd15 <int>, cesd16 <int>, cesd17 <int>, cesd18 <int>, cesd19 <int>,
#> #   cesd20 <int>, ahiTotal <int>, cesdTotal <int>
# id = 144 (as above), and 53 (new).

# Check late outliers:
filter(AHI_CESD, occasion == 4 & elapsed.days > 120)
#> # A tibble: 4 x 50
#>      id occasion elapsed.days intervention ahi01 ahi02 ahi03 ahi04 ahi05
#>   <int>    <int>        <dbl>        <int> <int> <int> <int> <int> <int>
#> 1     8        4         121.            2     2     1     2     2     1
#> 2    64        4         140.            1     3     3     2     3     2
#> 3   180        4         127.            1     3     4     3     4     4
#> 4   232        4         123.            1     3     2     3     4     4
#> # ... with 41 more variables: ahi06 <int>, ahi07 <int>, ahi08 <int>,
#> #   ahi09 <int>, ahi10 <int>, ahi11 <int>, ahi12 <int>, ahi13 <int>,
#> #   ahi14 <int>, ahi15 <int>, ahi16 <int>, ahi17 <int>, ahi18 <int>,
#> #   ahi19 <int>, ahi20 <int>, ahi21 <int>, ahi22 <int>, ahi23 <int>,
#> #   ahi24 <int>, cesd01 <int>, cesd02 <int>, cesd03 <int>, cesd04 <int>,
#> #   cesd05 <int>, cesd06 <int>, cesd07 <int>, cesd08 <int>, cesd09 <int>,
#> #   cesd10 <int>, cesd11 <int>, cesd12 <int>, cesd13 <int>, cesd14 <int>,
#> #   cesd15 <int>, cesd16 <int>, cesd17 <int>, cesd18 <int>, cesd19 <int>,
#> #   cesd20 <int>, ahiTotal <int>, cesdTotal <int>
# id = 8 and 232 (noted above), and 64 (noted above), and 180 (new).


# Occasion 5: 189 days ----- 

# Histogram of elapsed.days for occasion i; the dashed line marks the
# scheduled day of that occasion.
i <- 5
ggplot(filter(AHI_CESD, occasion == i), aes(x = elapsed.days)) +
  geom_histogram(binwidth = 1, fill = "steelblue") +
  geom_vline(xintercept = occ_days[i + 1], linetype = 2, color = "firebrick") +
  theme_bw() +
  labs(
    title = paste0(
      "Distribution of occasion ", i, " (", names(occ_days)[i + 1], ")"
    )
  )

# => Marked shift by about 7 days to left (i.e., earlier than stated), 
#    increasing range, several outliers (but NOT in range of earlier occasions).

# Check early outliers:
filter(AHI_CESD, occasion == 5 & elapsed.days < 181)
#> # A tibble: 0 x 50
#> # ... with 50 variables: id <int>, occasion <int>, elapsed.days <dbl>,
#> #   intervention <int>, ahi01 <int>, ahi02 <int>, ahi03 <int>,
#> #   ahi04 <int>, ahi05 <int>, ahi06 <int>, ahi07 <int>, ahi08 <int>,
#> #   ahi09 <int>, ahi10 <int>, ahi11 <int>, ahi12 <int>, ahi13 <int>,
#> #   ahi14 <int>, ahi15 <int>, ahi16 <int>, ahi17 <int>, ahi18 <int>,
#> #   ahi19 <int>, ahi20 <int>, ahi21 <int>, ahi22 <int>, ahi23 <int>,
#> #   ahi24 <int>, cesd01 <int>, cesd02 <int>, cesd03 <int>, cesd04 <int>,
#> #   cesd05 <int>, cesd06 <int>, cesd07 <int>, cesd08 <int>, cesd09 <int>,
#> #   cesd10 <int>, cesd11 <int>, cesd12 <int>, cesd13 <int>, cesd14 <int>,
#> #   cesd15 <int>, cesd16 <int>, cesd17 <int>, cesd18 <int>, cesd19 <int>,
#> #   cesd20 <int>, ahiTotal <int>, cesdTotal <int>
# => none.

# Check late outliers:
filter(AHI_CESD, occasion == 5 & elapsed.days > 209)
#> # A tibble: 4 x 50
#>      id occasion elapsed.days intervention ahi01 ahi02 ahi03 ahi04 ahi05
#>   <int>    <int>        <dbl>        <int> <int> <int> <int> <int> <int>
#> 1    57        5         212.            4     2     2     3     3     2
#> 2   197        5         211.            4     3     3     3     4     3
#> 3   224        5         210.            1     3     3     3     4     4
#> 4   232        5         224.            1     3     3     3     4     4
#> # ... with 41 more variables: ahi06 <int>, ahi07 <int>, ahi08 <int>,
#> #   ahi09 <int>, ahi10 <int>, ahi11 <int>, ahi12 <int>, ahi13 <int>,
#> #   ahi14 <int>, ahi15 <int>, ahi16 <int>, ahi17 <int>, ahi18 <int>,
#> #   ahi19 <int>, ahi20 <int>, ahi21 <int>, ahi22 <int>, ahi23 <int>,
#> #   ahi24 <int>, cesd01 <int>, cesd02 <int>, cesd03 <int>, cesd04 <int>,
#> #   cesd05 <int>, cesd06 <int>, cesd07 <int>, cesd08 <int>, cesd09 <int>,
#> #   cesd10 <int>, cesd11 <int>, cesd12 <int>, cesd13 <int>, cesd14 <int>,
#> #   cesd15 <int>, cesd16 <int>, cesd17 <int>, cesd18 <int>, cesd19 <int>,
#> #   cesd20 <int>, ahiTotal <int>, cesdTotal <int>
# => 232 (noted above), and 57, 197, 224 (new).

Result: The following Table summarizes early and late outliers (i.e., their id) by occasion:

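| Occasion | early | late |
|---|---|---|
| 1 | **144** | 226 |
| 2 | **144**, **275** | **8**, **232** |
| 3 | 76, **144**, **275** | **8**, 199, **232** |
| 4 | 53, **144** | **8**, *64*, 180, **232** |
| 5 | – | 57, 197, 224, **232** |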

Note that id values appearing repeatedly are shown in bold font. The id value of 64 is in italics, as this participant was noted above (as having 2 instances of occasion 4).

Inspect individual participants

Pieces of a puzzle:

  • Being early poses a bigger puzzle than being late.

  • Being consistently late could make perfect sense, if future invites depend on a minimum distance to an earlier occasion.

  • Being consistently early could indicate measurement errors.

To decide how to proceed, we first inspect the data of the participants noted repeatedly and correct some (in a copy of the data).

## Data: ----- 
# AHI_CESD
# occ_days

## Copy data (to keep original):
# All corrections are applied to this copy.
copy <- AHI_CESD

# id == 144: ----- 

# Key columns and total scores of participant 144:
AHI_CESD %>%
  select(id:intervention, ahiTotal, cesdTotal) %>%
  filter(id == 144)
#> # A tibble: 5 x 6
#>      id occasion elapsed.days intervention ahiTotal cesdTotal
#>   <int>    <int>        <dbl>        <int>    <int>     <int>
#> 1   144        0         0               1       59        18
#> 2   144        1         0               1       63         8
#> 3   144        2         7.10            1       69         5
#> 4   144        3        24.7             1       73         6
#> 5   144        4        87.7             1       75         3

# Interpretation:
# 5 measurements (occasions 0 to 4), 
# Occasion 0 and 1 (pre- and post-test) were both on the same day (0).
# However, values are different: 

# Note the BIG decrease (from 18 to 8 points) in cesdTotal from occasion 0 to 1.

# Decision: 
# Leave as is. 


# id == 275: ----- 

# Key columns and total scores of participant 275:
AHI_CESD %>%
  select(id:intervention, ahiTotal, cesdTotal) %>%
  filter(id == 275)
#> # A tibble: 3 x 6
#>      id occasion elapsed.days intervention ahiTotal cesdTotal
#>   <int>    <int>        <dbl>        <int>    <int>     <int>
#> 1   275        0         0               3       82         4
#> 2   275        2         7.18            3       92         4
#> 3   275        3        27.9             3       91         5

# Interpretation:
# Only 3 measurements, 
# Occasion 1 (post-test) is missing, 
# and time point of Occasions 2 and 3 is too early to be possible (and closer to means of earlier occasions than stated ones).

# Decision: 
# Correct copy by decrementing occasion values 2 and 3 to 1 and 2, respectively. 

# Re-label the occasions of participant 275 in the copy
# (AHI_CESD itself is not modified).
copy[(copy$id == 275), ]  # original values
#> # A tibble: 3 x 50
#>      id occasion elapsed.days intervention ahi01 ahi02 ahi03 ahi04 ahi05
#>   <int>    <int>        <dbl>        <int> <int> <int> <int> <int> <int>
#> 1   275        0         0               3     3     3     2     4     4
#> 2   275        2         7.18            3     4     4     3     4     4
#> 3   275        3        27.9             3     4     4     3     4     4
#> # ... with 41 more variables: ahi06 <int>, ahi07 <int>, ahi08 <int>,
#> #   ahi09 <int>, ahi10 <int>, ahi11 <int>, ahi12 <int>, ahi13 <int>,
#> #   ahi14 <int>, ahi15 <int>, ahi16 <int>, ahi17 <int>, ahi18 <int>,
#> #   ahi19 <int>, ahi20 <int>, ahi21 <int>, ahi22 <int>, ahi23 <int>,
#> #   ahi24 <int>, cesd01 <int>, cesd02 <int>, cesd03 <int>, cesd04 <int>,
#> #   cesd05 <int>, cesd06 <int>, cesd07 <int>, cesd08 <int>, cesd09 <int>,
#> #   cesd10 <int>, cesd11 <int>, cesd12 <int>, cesd13 <int>, cesd14 <int>,
#> #   cesd15 <int>, cesd16 <int>, cesd17 <int>, cesd18 <int>, cesd19 <int>,
#> #   cesd20 <int>, ahiTotal <int>, cesdTotal <int>
# The order of the 2 corrections matters: occasion 2 is changed to 1
# before occasion 3 is changed to 2. In reverse order, the condition
# (occasion == 2) would match both rows.
# NOTE(review): the assigned values 1 and 2 are doubles, so the occasion
# column of copy is printed as <dbl> (instead of <int>) from here on;
# assigning 1L and 2L would keep it integer.
copy[(copy$id == 275) & (copy$occasion == 2), ]$occasion       # is 2
#> [1] 2
copy[(copy$id == 275) & (copy$occasion == 2), ]$occasion <- 1  # correct to 1
copy[(copy$id == 275) & (copy$occasion == 3), ]$occasion       # is 3
#> [1] 3
copy[(copy$id == 275) & (copy$occasion == 3), ]$occasion <- 2  # correct to 2
copy[(copy$id == 275), ]  # corrected values
#> # A tibble: 3 x 50
#>      id occasion elapsed.days intervention ahi01 ahi02 ahi03 ahi04 ahi05
#>   <int>    <dbl>        <dbl>        <int> <int> <int> <int> <int> <int>
#> 1   275        0         0               3     3     3     2     4     4
#> 2   275        1         7.18            3     4     4     3     4     4
#> 3   275        2        27.9             3     4     4     3     4     4
#> # ... with 41 more variables: ahi06 <int>, ahi07 <int>, ahi08 <int>,
#> #   ahi09 <int>, ahi10 <int>, ahi11 <int>, ahi12 <int>, ahi13 <int>,
#> #   ahi14 <int>, ahi15 <int>, ahi16 <int>, ahi17 <int>, ahi18 <int>,
#> #   ahi19 <int>, ahi20 <int>, ahi21 <int>, ahi22 <int>, ahi23 <int>,
#> #   ahi24 <int>, cesd01 <int>, cesd02 <int>, cesd03 <int>, cesd04 <int>,
#> #   cesd05 <int>, cesd06 <int>, cesd07 <int>, cesd08 <int>, cesd09 <int>,
#> #   cesd10 <int>, cesd11 <int>, cesd12 <int>, cesd13 <int>, cesd14 <int>,
#> #   cesd15 <int>, cesd16 <int>, cesd17 <int>, cesd18 <int>, cesd19 <int>,
#> #   cesd20 <int>, ahiTotal <int>, cesdTotal <int>


# id == 8: ----- 

# Key columns and total scores of participant 8:
AHI_CESD %>%
  select(id:intervention, ahiTotal, cesdTotal) %>%
  filter(id == 8)
#> # A tibble: 6 x 6
#>      id occasion elapsed.days intervention ahiTotal cesdTotal
#>   <int>    <int>        <dbl>        <int>    <int>     <int>
#> 1     8        0         0               2       59        30
#> 2     8        1         8.80            2       45        45
#> 3     8        2        21.9             2       38        55
#> 4     8        2        38.9             2       44        49
#> 5     8        3        50.6             2       48        34
#> 6     8        4       121.              2       43        51

# Interpretation:
# 6 measurements, but only 5 occasion values (Occasion 2 twice):
# However, elapsed.days are in order and last 2 measurements too early for Occasions 4 and 5.
# Thus, the 2nd instance of Occasion 2 could have been Occasion 3, 
# but current values of 3 and 4 cannot have been 4 and 5.

# Note lower ahiTotal values and larger cesdTotal values than previous participants.

# Decision:
# Correct copy by removing 2nd instance of Occasion 2, but leave the rest as is.
# Alternatively, we could select a random instance or average the total values of both Occasion 2 instances. 

copy  # current copy (992 rows)
#> # A tibble: 992 x 50
#>       id occasion elapsed.days intervention ahi01 ahi02 ahi03 ahi04 ahi05
#>    <int>    <dbl>        <dbl>        <int> <int> <int> <int> <int> <int>
#>  1     1        0         0               4     2     3     2     3     3
#>  2     1        1        11.8             4     3     3     4     3     3
#>  3     2        0         0               1     3     4     3     4     2
#>  4     2        1         8.02            1     3     4     4     4     3
#>  5     2        2        14.3             1     3     4     4     4     3
#>  6     2        3        32.0             1     3     4     4     4     4
#>  7     2        4        92.2             1     3     3     2     3     3
#>  8     2        5       182.              1     3     3     3     4     2
#>  9     3        0         0               4     3     3     2     4     2
#> 10     3        2        16.4             4     3     3     3     4     4
#> # ... with 982 more rows, and 41 more variables: ahi06 <int>, ahi07 <int>,
#> #   ahi08 <int>, ahi09 <int>, ahi10 <int>, ahi11 <int>, ahi12 <int>,
#> #   ahi13 <int>, ahi14 <int>, ahi15 <int>, ahi16 <int>, ahi17 <int>,
#> #   ahi18 <int>, ahi19 <int>, ahi20 <int>, ahi21 <int>, ahi22 <int>,
#> #   ahi23 <int>, ahi24 <int>, cesd01 <int>, cesd02 <int>, cesd03 <int>,
#> #   cesd04 <int>, cesd05 <int>, cesd06 <int>, cesd07 <int>, cesd08 <int>,
#> #   cesd09 <int>, cesd10 <int>, cesd11 <int>, cesd12 <int>, cesd13 <int>,
#> #   cesd14 <int>, cesd15 <int>, cesd16 <int>, cesd17 <int>, cesd18 <int>,
#> #   cesd19 <int>, cesd20 <int>, ahiTotal <int>, cesdTotal <int>
copy %>% filter(id == 8)  # original lines
#> # A tibble: 6 x 50
#>      id occasion elapsed.days intervention ahi01 ahi02 ahi03 ahi04 ahi05
#>   <int>    <dbl>        <dbl>        <int> <int> <int> <int> <int> <int>
#> 1     8        0         0               2     3     2     2     2     2
#> 2     8        1         8.80            2     2     2     1     2     2
#> 3     8        2        21.9             2     2     1     1     1     1
#> 4     8        2        38.9             2     2     2     1     1     2
#> 5     8        3        50.6             2     3     2     1     2     1
#> 6     8        4       121.              2     2     1     2     2     1
#> # ... with 41 more variables: ahi06 <int>, ahi07 <int>, ahi08 <int>,
#> #   ahi09 <int>, ahi10 <int>, ahi11 <int>, ahi12 <int>, ahi13 <int>,
#> #   ahi14 <int>, ahi15 <int>, ahi16 <int>, ahi17 <int>, ahi18 <int>,
#> #   ahi19 <int>, ahi20 <int>, ahi21 <int>, ahi22 <int>, ahi23 <int>,
#> #   ahi24 <int>, cesd01 <int>, cesd02 <int>, cesd03 <int>, cesd04 <int>,
#> #   cesd05 <int>, cesd06 <int>, cesd07 <int>, cesd08 <int>, cesd09 <int>,
#> #   cesd10 <int>, cesd11 <int>, cesd12 <int>, cesd13 <int>, cesd14 <int>,
#> #   cesd15 <int>, cesd16 <int>, cesd17 <int>, cesd18 <int>, cesd19 <int>,
#> #   cesd20 <int>, ahiTotal <int>, cesdTotal <int>
filter(copy, id == 8 & occasion == 2 & elapsed.days > 30)  # the 1 row to be removed
#> # A tibble: 1 x 50
#>      id occasion elapsed.days intervention ahi01 ahi02 ahi03 ahi04 ahi05
#>   <int>    <dbl>        <dbl>        <int> <int> <int> <int> <int> <int>
#> 1     8        2         38.9            2     2     2     1     1     2
#> # ... with 41 more variables: ahi06 <int>, ahi07 <int>, ahi08 <int>,
#> #   ahi09 <int>, ahi10 <int>, ahi11 <int>, ahi12 <int>, ahi13 <int>,
#> #   ahi14 <int>, ahi15 <int>, ahi16 <int>, ahi17 <int>, ahi18 <int>,
#> #   ahi19 <int>, ahi20 <int>, ahi21 <int>, ahi22 <int>, ahi23 <int>,
#> #   ahi24 <int>, cesd01 <int>, cesd02 <int>, cesd03 <int>, cesd04 <int>,
#> #   cesd05 <int>, cesd06 <int>, cesd07 <int>, cesd08 <int>, cesd09 <int>,
#> #   cesd10 <int>, cesd11 <int>, cesd12 <int>, cesd13 <int>, cesd14 <int>,
#> #   cesd15 <int>, cesd16 <int>, cesd17 <int>, cesd18 <int>, cesd19 <int>,
#> #   cesd20 <int>, ahiTotal <int>, cesdTotal <int>
filter(copy, !(id == 8 & occasion == 2 & elapsed.days > 30))  # the other 991 rows
#> # A tibble: 991 x 50
#>       id occasion elapsed.days intervention ahi01 ahi02 ahi03 ahi04 ahi05
#>    <int>    <dbl>        <dbl>        <int> <int> <int> <int> <int> <int>
#>  1     1        0         0               4     2     3     2     3     3
#>  2     1        1        11.8             4     3     3     4     3     3
#>  3     2        0         0               1     3     4     3     4     2
#>  4     2        1         8.02            1     3     4     4     4     3
#>  5     2        2        14.3             1     3     4     4     4     3
#>  6     2        3        32.0             1     3     4     4     4     4
#>  7     2        4        92.2             1     3     3     2     3     3
#>  8     2        5       182.              1     3     3     3     4     2
#>  9     3        0         0               4     3     3     2     4     2
#> 10     3        2        16.4             4     3     3     3     4     4
#> # ... with 981 more rows, and 41 more variables: ahi06 <int>, ahi07 <int>,
#> #   ahi08 <int>, ahi09 <int>, ahi10 <int>, ahi11 <int>, ahi12 <int>,
#> #   ahi13 <int>, ahi14 <int>, ahi15 <int>, ahi16 <int>, ahi17 <int>,
#> #   ahi18 <int>, ahi19 <int>, ahi20 <int>, ahi21 <int>, ahi22 <int>,
#> #   ahi23 <int>, ahi24 <int>, cesd01 <int>, cesd02 <int>, cesd03 <int>,
#> #   cesd04 <int>, cesd05 <int>, cesd06 <int>, cesd07 <int>, cesd08 <int>,
#> #   cesd09 <int>, cesd10 <int>, cesd11 <int>, cesd12 <int>, cesd13 <int>,
#> #   cesd14 <int>, cesd15 <int>, cesd16 <int>, cesd17 <int>, cesd18 <int>,
#> #   cesd19 <int>, cesd20 <int>, ahiTotal <int>, cesdTotal <int>

# Keep every row except the late 2nd instance of Occasion 2 of participant 8:
copy <- filter(copy, !(id == 8 & occasion == 2 & elapsed.days > 30))  # drops 1 row
copy  # 991 rows remaining (qed).
#> # A tibble: 991 x 50
#>       id occasion elapsed.days intervention ahi01 ahi02 ahi03 ahi04 ahi05
#>    <int>    <dbl>        <dbl>        <int> <int> <int> <int> <int> <int>
#>  1     1        0         0               4     2     3     2     3     3
#>  2     1        1        11.8             4     3     3     4     3     3
#>  3     2        0         0               1     3     4     3     4     2
#>  4     2        1         8.02            1     3     4     4     4     3
#>  5     2        2        14.3             1     3     4     4     4     3
#>  6     2        3        32.0             1     3     4     4     4     4
#>  7     2        4        92.2             1     3     3     2     3     3
#>  8     2        5       182.              1     3     3     3     4     2
#>  9     3        0         0               4     3     3     2     4     2
#> 10     3        2        16.4             4     3     3     3     4     4
#> # ... with 981 more rows, and 41 more variables: ahi06 <int>, ahi07 <int>,
#> #   ahi08 <int>, ahi09 <int>, ahi10 <int>, ahi11 <int>, ahi12 <int>,
#> #   ahi13 <int>, ahi14 <int>, ahi15 <int>, ahi16 <int>, ahi17 <int>,
#> #   ahi18 <int>, ahi19 <int>, ahi20 <int>, ahi21 <int>, ahi22 <int>,
#> #   ahi23 <int>, ahi24 <int>, cesd01 <int>, cesd02 <int>, cesd03 <int>,
#> #   cesd04 <int>, cesd05 <int>, cesd06 <int>, cesd07 <int>, cesd08 <int>,
#> #   cesd09 <int>, cesd10 <int>, cesd11 <int>, cesd12 <int>, cesd13 <int>,
#> #   cesd14 <int>, cesd15 <int>, cesd16 <int>, cesd17 <int>, cesd18 <int>,
#> #   cesd19 <int>, cesd20 <int>, ahiTotal <int>, cesdTotal <int>


# id == 64: ----- 

# Key columns of all rows that the original data contains for participant 64:
select(
  filter(AHI_CESD, id == 64),
  id:intervention, ahiTotal, cesdTotal
)
#> # A tibble: 6 x 6
#>      id occasion elapsed.days intervention ahiTotal cesdTotal
#>   <int>    <int>        <dbl>        <int>    <int>     <int>
#> 1    64        0          0              1       61        27
#> 2    64        1         10.2            1       61        22
#> 3    64        2         19.0            1       66        20
#> 4    64        4        103.             1       61        19
#> 5    64        4        140.             1       65        16
#> 6    64        5        185.             1       67        19

# Interpretation:
# 6 measurements, but only 5 distinct occasion values (Occasion 3 is missing, Occasion 4 occurs twice).
# However, elapsed.days are in ascending order, and the 1st instance of Occasion 4 (103 days)
# is too late for Occasion 3 (nominally 38 days) and lies after the nominal start of Occasion 4 (98 days).

# Decision:
# Correct copy by removing the 2nd instance of Occasion 4 (140 days), but leave the rest as is.
# Alternatively, we could select a random instance or average the total values of both Occasion 4 instances.

copy  # current copy (991 rows, i.e., after removing 1 row of id 8)
#> # A tibble: 991 x 50
#>       id occasion elapsed.days intervention ahi01 ahi02 ahi03 ahi04 ahi05
#>    <int>    <dbl>        <dbl>        <int> <int> <int> <int> <int> <int>
#>  1     1        0         0               4     2     3     2     3     3
#>  2     1        1        11.8             4     3     3     4     3     3
#>  3     2        0         0               1     3     4     3     4     2
#>  4     2        1         8.02            1     3     4     4     4     3
#>  5     2        2        14.3             1     3     4     4     4     3
#>  6     2        3        32.0             1     3     4     4     4     4
#>  7     2        4        92.2             1     3     3     2     3     3
#>  8     2        5       182.              1     3     3     3     4     2
#>  9     3        0         0               4     3     3     2     4     2
#> 10     3        2        16.4             4     3     3     3     4     4
#> # ... with 981 more rows, and 41 more variables: ahi06 <int>, ahi07 <int>,
#> #   ahi08 <int>, ahi09 <int>, ahi10 <int>, ahi11 <int>, ahi12 <int>,
#> #   ahi13 <int>, ahi14 <int>, ahi15 <int>, ahi16 <int>, ahi17 <int>,
#> #   ahi18 <int>, ahi19 <int>, ahi20 <int>, ahi21 <int>, ahi22 <int>,
#> #   ahi23 <int>, ahi24 <int>, cesd01 <int>, cesd02 <int>, cesd03 <int>,
#> #   cesd04 <int>, cesd05 <int>, cesd06 <int>, cesd07 <int>, cesd08 <int>,
#> #   cesd09 <int>, cesd10 <int>, cesd11 <int>, cesd12 <int>, cesd13 <int>,
#> #   cesd14 <int>, cesd15 <int>, cesd16 <int>, cesd17 <int>, cesd18 <int>,
#> #   cesd19 <int>, cesd20 <int>, ahiTotal <int>, cesdTotal <int>
filter(copy, id == 64)  # all 6 rows of participant 64 (before the correction)
#> # A tibble: 6 x 50
#>      id occasion elapsed.days intervention ahi01 ahi02 ahi03 ahi04 ahi05
#>   <int>    <dbl>        <dbl>        <int> <int> <int> <int> <int> <int>
#> 1    64        0          0              1     2     2     2     3     2
#> 2    64        1         10.2            1     3     2     2     3     3
#> 3    64        2         19.0            1     3     3     2     3     2
#> 4    64        4        103.             1     2     3     2     3     2
#> 5    64        4        140.             1     3     3     2     3     2
#> 6    64        5        185.             1     2     2     3     4     2
#> # ... with 41 more variables: ahi06 <int>, ahi07 <int>, ahi08 <int>,
#> #   ahi09 <int>, ahi10 <int>, ahi11 <int>, ahi12 <int>, ahi13 <int>,
#> #   ahi14 <int>, ahi15 <int>, ahi16 <int>, ahi17 <int>, ahi18 <int>,
#> #   ahi19 <int>, ahi20 <int>, ahi21 <int>, ahi22 <int>, ahi23 <int>,
#> #   ahi24 <int>, cesd01 <int>, cesd02 <int>, cesd03 <int>, cesd04 <int>,
#> #   cesd05 <int>, cesd06 <int>, cesd07 <int>, cesd08 <int>, cesd09 <int>,
#> #   cesd10 <int>, cesd11 <int>, cesd12 <int>, cesd13 <int>, cesd14 <int>,
#> #   cesd15 <int>, cesd16 <int>, cesd17 <int>, cesd18 <int>, cesd19 <int>,
#> #   cesd20 <int>, ahiTotal <int>, cesdTotal <int>
filter(copy, id == 64 & occasion == 4 & elapsed.days > 130)  # the 1 row to be removed
#> # A tibble: 1 x 50
#>      id occasion elapsed.days intervention ahi01 ahi02 ahi03 ahi04 ahi05
#>   <int>    <dbl>        <dbl>        <int> <int> <int> <int> <int> <int>
#> 1    64        4         140.            1     3     3     2     3     2
#> # ... with 41 more variables: ahi06 <int>, ahi07 <int>, ahi08 <int>,
#> #   ahi09 <int>, ahi10 <int>, ahi11 <int>, ahi12 <int>, ahi13 <int>,
#> #   ahi14 <int>, ahi15 <int>, ahi16 <int>, ahi17 <int>, ahi18 <int>,
#> #   ahi19 <int>, ahi20 <int>, ahi21 <int>, ahi22 <int>, ahi23 <int>,
#> #   ahi24 <int>, cesd01 <int>, cesd02 <int>, cesd03 <int>, cesd04 <int>,
#> #   cesd05 <int>, cesd06 <int>, cesd07 <int>, cesd08 <int>, cesd09 <int>,
#> #   cesd10 <int>, cesd11 <int>, cesd12 <int>, cesd13 <int>, cesd14 <int>,
#> #   cesd15 <int>, cesd16 <int>, cesd17 <int>, cesd18 <int>, cesd19 <int>,
#> #   cesd20 <int>, ahiTotal <int>, cesdTotal <int>
filter(copy, !(id == 64 & occasion == 4 & elapsed.days > 130))  # the other 990 rows
#> # A tibble: 990 x 50
#>       id occasion elapsed.days intervention ahi01 ahi02 ahi03 ahi04 ahi05
#>    <int>    <dbl>        <dbl>        <int> <int> <int> <int> <int> <int>
#>  1     1        0         0               4     2     3     2     3     3
#>  2     1        1        11.8             4     3     3     4     3     3
#>  3     2        0         0               1     3     4     3     4     2
#>  4     2        1         8.02            1     3     4     4     4     3
#>  5     2        2        14.3             1     3     4     4     4     3
#>  6     2        3        32.0             1     3     4     4     4     4
#>  7     2        4        92.2             1     3     3     2     3     3
#>  8     2        5       182.              1     3     3     3     4     2
#>  9     3        0         0               4     3     3     2     4     2
#> 10     3        2        16.4             4     3     3     3     4     4
#> # ... with 980 more rows, and 41 more variables: ahi06 <int>, ahi07 <int>,
#> #   ahi08 <int>, ahi09 <int>, ahi10 <int>, ahi11 <int>, ahi12 <int>,
#> #   ahi13 <int>, ahi14 <int>, ahi15 <int>, ahi16 <int>, ahi17 <int>,
#> #   ahi18 <int>, ahi19 <int>, ahi20 <int>, ahi21 <int>, ahi22 <int>,
#> #   ahi23 <int>, ahi24 <int>, cesd01 <int>, cesd02 <int>, cesd03 <int>,
#> #   cesd04 <int>, cesd05 <int>, cesd06 <int>, cesd07 <int>, cesd08 <int>,
#> #   cesd09 <int>, cesd10 <int>, cesd11 <int>, cesd12 <int>, cesd13 <int>,
#> #   cesd14 <int>, cesd15 <int>, cesd16 <int>, cesd17 <int>, cesd18 <int>,
#> #   cesd19 <int>, cesd20 <int>, ahiTotal <int>, cesdTotal <int>

# Keep every row except the late 2nd instance of Occasion 4 of participant 64:
copy <- filter(copy, !(id == 64 & occasion == 4 & elapsed.days > 130))  # drops 1 row
copy  # 990 rows remaining (qed).
#> # A tibble: 990 x 50
#>       id occasion elapsed.days intervention ahi01 ahi02 ahi03 ahi04 ahi05
#>    <int>    <dbl>        <dbl>        <int> <int> <int> <int> <int> <int>
#>  1     1        0         0               4     2     3     2     3     3
#>  2     1        1        11.8             4     3     3     4     3     3
#>  3     2        0         0               1     3     4     3     4     2
#>  4     2        1         8.02            1     3     4     4     4     3
#>  5     2        2        14.3             1     3     4     4     4     3
#>  6     2        3        32.0             1     3     4     4     4     4
#>  7     2        4        92.2             1     3     3     2     3     3
#>  8     2        5       182.              1     3     3     3     4     2
#>  9     3        0         0               4     3     3     2     4     2
#> 10     3        2        16.4             4     3     3     3     4     4
#> # ... with 980 more rows, and 41 more variables: ahi06 <int>, ahi07 <int>,
#> #   ahi08 <int>, ahi09 <int>, ahi10 <int>, ahi11 <int>, ahi12 <int>,
#> #   ahi13 <int>, ahi14 <int>, ahi15 <int>, ahi16 <int>, ahi17 <int>,
#> #   ahi18 <int>, ahi19 <int>, ahi20 <int>, ahi21 <int>, ahi22 <int>,
#> #   ahi23 <int>, ahi24 <int>, cesd01 <int>, cesd02 <int>, cesd03 <int>,
#> #   cesd04 <int>, cesd05 <int>, cesd06 <int>, cesd07 <int>, cesd08 <int>,
#> #   cesd09 <int>, cesd10 <int>, cesd11 <int>, cesd12 <int>, cesd13 <int>,
#> #   cesd14 <int>, cesd15 <int>, cesd16 <int>, cesd17 <int>, cesd18 <int>,
#> #   cesd19 <int>, cesd20 <int>, ahiTotal <int>, cesdTotal <int>


# id == 232: ----- 

# Key columns of all rows that the original data contains for participant 232:
select(
  filter(AHI_CESD, id == 232),
  id:intervention, ahiTotal, cesdTotal
)
#> # A tibble: 5 x 6
#>      id occasion elapsed.days intervention ahiTotal cesdTotal
#>   <int>    <int>        <dbl>        <int>    <int>     <int>
#> 1   232        0          0              1       64        10
#> 2   232        2         48.8            1       79         4
#> 3   232        3         61.8            1       78         3
#> 4   232        4        123.             1       77        10
#> 5   232        5        224.             1       81         7

# Interpretation:
# 5 measurements, Occasion 1 missing.
# However, elapsed.days are in order and correspond to stated occasions 
# (with the participant being consistently late from Occasion 2 onwards).

# Decision: 
# Leave as is.

Correct or repair data file for 3 participants and save corrected data as a copy (done above).

Compare corrected and original data

Compare corrected copy with original data (AHI_CESD) and save and re-load copy (as AHI_CESD_2):

## Data: ----- 
# AHI_CESD: original data
# copy:     corrected data

# Dimensions: ----- 
AHI_CESD %>% dim()  # 992 rows
#> [1] 992  50
copy %>% dim()  # 990 rows (qed): 2 rows fewer than the original
#> [1] 990  50

# Compare the rows of the corrected participants in the original data vs. in copy: ----- 

filter(AHI_CESD, id == 8)  # original: 6 rows
#> # A tibble: 6 x 50
#>      id occasion elapsed.days intervention ahi01 ahi02 ahi03 ahi04 ahi05
#>   <int>    <int>        <dbl>        <int> <int> <int> <int> <int> <int>
#> 1     8        0         0               2     3     2     2     2     2
#> 2     8        1         8.80            2     2     2     1     2     2
#> 3     8        2        21.9             2     2     1     1     1     1
#> 4     8        2        38.9             2     2     2     1     1     2
#> 5     8        3        50.6             2     3     2     1     2     1
#> 6     8        4       121.              2     2     1     2     2     1
#> # ... with 41 more variables: ahi06 <int>, ahi07 <int>, ahi08 <int>,
#> #   ahi09 <int>, ahi10 <int>, ahi11 <int>, ahi12 <int>, ahi13 <int>,
#> #   ahi14 <int>, ahi15 <int>, ahi16 <int>, ahi17 <int>, ahi18 <int>,
#> #   ahi19 <int>, ahi20 <int>, ahi21 <int>, ahi22 <int>, ahi23 <int>,
#> #   ahi24 <int>, cesd01 <int>, cesd02 <int>, cesd03 <int>, cesd04 <int>,
#> #   cesd05 <int>, cesd06 <int>, cesd07 <int>, cesd08 <int>, cesd09 <int>,
#> #   cesd10 <int>, cesd11 <int>, cesd12 <int>, cesd13 <int>, cesd14 <int>,
#> #   cesd15 <int>, cesd16 <int>, cesd17 <int>, cesd18 <int>, cesd19 <int>,
#> #   cesd20 <int>, ahiTotal <int>, cesdTotal <int>
filter(copy, id == 8)  # 1 row dropped
#> # A tibble: 5 x 50
#>      id occasion elapsed.days intervention ahi01 ahi02 ahi03 ahi04 ahi05
#>   <int>    <dbl>        <dbl>        <int> <int> <int> <int> <int> <int>
#> 1     8        0         0               2     3     2     2     2     2
#> 2     8        1         8.80            2     2     2     1     2     2
#> 3     8        2        21.9             2     2     1     1     1     1
#> 4     8        3        50.6             2     3     2     1     2     1
#> 5     8        4       121.              2     2     1     2     2     1
#> # ... with 41 more variables: ahi06 <int>, ahi07 <int>, ahi08 <int>,
#> #   ahi09 <int>, ahi10 <int>, ahi11 <int>, ahi12 <int>, ahi13 <int>,
#> #   ahi14 <int>, ahi15 <int>, ahi16 <int>, ahi17 <int>, ahi18 <int>,
#> #   ahi19 <int>, ahi20 <int>, ahi21 <int>, ahi22 <int>, ahi23 <int>,
#> #   ahi24 <int>, cesd01 <int>, cesd02 <int>, cesd03 <int>, cesd04 <int>,
#> #   cesd05 <int>, cesd06 <int>, cesd07 <int>, cesd08 <int>, cesd09 <int>,
#> #   cesd10 <int>, cesd11 <int>, cesd12 <int>, cesd13 <int>, cesd14 <int>,
#> #   cesd15 <int>, cesd16 <int>, cesd17 <int>, cesd18 <int>, cesd19 <int>,
#> #   cesd20 <int>, ahiTotal <int>, cesdTotal <int>

filter(AHI_CESD, id == 64)  # original: 6 rows
#> # A tibble: 6 x 50
#>      id occasion elapsed.days intervention ahi01 ahi02 ahi03 ahi04 ahi05
#>   <int>    <int>        <dbl>        <int> <int> <int> <int> <int> <int>
#> 1    64        0          0              1     2     2     2     3     2
#> 2    64        1         10.2            1     3     2     2     3     3
#> 3    64        2         19.0            1     3     3     2     3     2
#> 4    64        4        103.             1     2     3     2     3     2
#> 5    64        4        140.             1     3     3     2     3     2
#> 6    64        5        185.             1     2     2     3     4     2
#> # ... with 41 more variables: ahi06 <int>, ahi07 <int>, ahi08 <int>,
#> #   ahi09 <int>, ahi10 <int>, ahi11 <int>, ahi12 <int>, ahi13 <int>,
#> #   ahi14 <int>, ahi15 <int>, ahi16 <int>, ahi17 <int>, ahi18 <int>,
#> #   ahi19 <int>, ahi20 <int>, ahi21 <int>, ahi22 <int>, ahi23 <int>,
#> #   ahi24 <int>, cesd01 <int>, cesd02 <int>, cesd03 <int>, cesd04 <int>,
#> #   cesd05 <int>, cesd06 <int>, cesd07 <int>, cesd08 <int>, cesd09 <int>,
#> #   cesd10 <int>, cesd11 <int>, cesd12 <int>, cesd13 <int>, cesd14 <int>,
#> #   cesd15 <int>, cesd16 <int>, cesd17 <int>, cesd18 <int>, cesd19 <int>,
#> #   cesd20 <int>, ahiTotal <int>, cesdTotal <int>
filter(copy, id == 64)  # 1 row dropped
#> # A tibble: 5 x 50
#>      id occasion elapsed.days intervention ahi01 ahi02 ahi03 ahi04 ahi05
#>   <int>    <dbl>        <dbl>        <int> <int> <int> <int> <int> <int>
#> 1    64        0          0              1     2     2     2     3     2
#> 2    64        1         10.2            1     3     2     2     3     3
#> 3    64        2         19.0            1     3     3     2     3     2
#> 4    64        4        103.             1     2     3     2     3     2
#> 5    64        5        185.             1     2     2     3     4     2
#> # ... with 41 more variables: ahi06 <int>, ahi07 <int>, ahi08 <int>,
#> #   ahi09 <int>, ahi10 <int>, ahi11 <int>, ahi12 <int>, ahi13 <int>,
#> #   ahi14 <int>, ahi15 <int>, ahi16 <int>, ahi17 <int>, ahi18 <int>,
#> #   ahi19 <int>, ahi20 <int>, ahi21 <int>, ahi22 <int>, ahi23 <int>,
#> #   ahi24 <int>, cesd01 <int>, cesd02 <int>, cesd03 <int>, cesd04 <int>,
#> #   cesd05 <int>, cesd06 <int>, cesd07 <int>, cesd08 <int>, cesd09 <int>,
#> #   cesd10 <int>, cesd11 <int>, cesd12 <int>, cesd13 <int>, cesd14 <int>,
#> #   cesd15 <int>, cesd16 <int>, cesd17 <int>, cesd18 <int>, cesd19 <int>,
#> #   cesd20 <int>, ahiTotal <int>, cesdTotal <int>

filter(AHI_CESD, id == 275)  # original: occasions 0, 2, 3
#> # A tibble: 3 x 50
#>      id occasion elapsed.days intervention ahi01 ahi02 ahi03 ahi04 ahi05
#>   <int>    <int>        <dbl>        <int> <int> <int> <int> <int> <int>
#> 1   275        0         0               3     3     3     2     4     4
#> 2   275        2         7.18            3     4     4     3     4     4
#> 3   275        3        27.9             3     4     4     3     4     4
#> # ... with 41 more variables: ahi06 <int>, ahi07 <int>, ahi08 <int>,
#> #   ahi09 <int>, ahi10 <int>, ahi11 <int>, ahi12 <int>, ahi13 <int>,
#> #   ahi14 <int>, ahi15 <int>, ahi16 <int>, ahi17 <int>, ahi18 <int>,
#> #   ahi19 <int>, ahi20 <int>, ahi21 <int>, ahi22 <int>, ahi23 <int>,
#> #   ahi24 <int>, cesd01 <int>, cesd02 <int>, cesd03 <int>, cesd04 <int>,
#> #   cesd05 <int>, cesd06 <int>, cesd07 <int>, cesd08 <int>, cesd09 <int>,
#> #   cesd10 <int>, cesd11 <int>, cesd12 <int>, cesd13 <int>, cesd14 <int>,
#> #   cesd15 <int>, cesd16 <int>, cesd17 <int>, cesd18 <int>, cesd19 <int>,
#> #   cesd20 <int>, ahiTotal <int>, cesdTotal <int>
filter(copy, id == 275)  # 2 occasion values changed (decremented)
#> # A tibble: 3 x 50
#>      id occasion elapsed.days intervention ahi01 ahi02 ahi03 ahi04 ahi05
#>   <int>    <dbl>        <dbl>        <int> <int> <int> <int> <int> <int>
#> 1   275        0         0               3     3     3     2     4     4
#> 2   275        1         7.18            3     4     4     3     4     4
#> 3   275        2        27.9             3     4     4     3     4     4
#> # ... with 41 more variables: ahi06 <int>, ahi07 <int>, ahi08 <int>,
#> #   ahi09 <int>, ahi10 <int>, ahi11 <int>, ahi12 <int>, ahi13 <int>,
#> #   ahi14 <int>, ahi15 <int>, ahi16 <int>, ahi17 <int>, ahi18 <int>,
#> #   ahi19 <int>, ahi20 <int>, ahi21 <int>, ahi22 <int>, ahi23 <int>,
#> #   ahi24 <int>, cesd01 <int>, cesd02 <int>, cesd03 <int>, cesd04 <int>,
#> #   cesd05 <int>, cesd06 <int>, cesd07 <int>, cesd08 <int>, cesd09 <int>,
#> #   cesd10 <int>, cesd11 <int>, cesd12 <int>, cesd13 <int>, cesd14 <int>,
#> #   cesd15 <int>, cesd16 <int>, cesd17 <int>, cesd18 <int>, cesd19 <int>,
#> #   cesd20 <int>, ahiTotal <int>, cesdTotal <int>

# Save copy data:
# NOTE: the target is an absolute path on the author's machine; adjust it when running elsewhere.
AHI_CESD_corrected <- copy
# The target file is passed by position: readr (>= 1.4.0) deprecated the `path` argument
# of write_csv() in favour of `file`, and the positional form works with either name.
write_csv(AHI_CESD_corrected, "/Users/hneth/Desktop/stuff/Dropbox/_code/R/_teachR/ds4psy/data/posPsy_AHI_CESD_corrected.csv")

# Re-load the corrected data as AHI_CESD_2 (either from the local file or from the online copy):
# AHI_CESD_2 <- read_csv(file = "/Users/hneth/Desktop/stuff/Dropbox/_code/R/_teachR/ds4psy/data/posPsy_AHI_CESD_corrected.csv")  # local
AHI_CESD_2 <- read_csv("http://rpository.com/ds4psy/data/posPsy_AHI_CESD_corrected.csv")  # online
AHI_CESD_2 %>% dim()  # 990 x 50
#> [1] 990  50

# all.equal(AHI_CESD_corrected, AHI_CESD_2)  # => should be TRUE (except for numeric format of occasion)

Proceed with corrected and re-loaded version of AHI_CESD_2 data!

Inspect corrected data

p_info %>% glimpse()  # participant data: 1 row per participant
#> Observations: 295
#> Variables: 6
#> $ id           <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15...
#> $ intervention <int> 4, 1, 4, 3, 2, 1, 3, 2, 1, 2, 2, 2, 4, 4, 4, 4, 3...
#> $ sex          <int> 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1...
#> $ age          <int> 35, 59, 51, 50, 58, 31, 44, 57, 36, 45, 56, 46, 3...
#> $ educ         <int> 5, 1, 4, 5, 5, 5, 5, 4, 4, 4, 5, 4, 5, 1, 2, 1, 4...
#> $ income       <int> 3, 1, 3, 2, 2, 1, 2, 2, 3, 3, 1, 3, 3, 2, 2, 1, 2...
AHI_CESD_2 %>% glimpse()  # corrected data: 1 row per participant and occasion
#> Observations: 990
#> Variables: 50
#> $ id           <int> 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 4, 4, 4, 4, 5, 5, 5...
#> $ occasion     <int> 0, 1, 0, 1, 2, 3, 4, 5, 0, 2, 0, 1, 3, 4, 0, 1, 2...
#> $ elapsed.days <dbl> 0.000000, 11.772731, 0.000000, 8.017523, 14.30408...
#> $ intervention <int> 4, 4, 1, 1, 1, 1, 1, 1, 4, 4, 3, 3, 3, 3, 2, 2, 2...
#> $ ahi01        <int> 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 1, 3, 1, 1, 2...
#> $ ahi02        <int> 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2...
#> $ ahi03        <int> 2, 4, 3, 4, 4, 4, 2, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3...
#> $ ahi04        <int> 3, 3, 4, 4, 4, 4, 3, 4, 4, 4, 2, 4, 4, 4, 2, 2, 2...
#> $ ahi05        <int> 3, 3, 2, 3, 3, 4, 3, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2...
#> $ ahi06        <int> 2, 4, 3, 3, 3, 4, 3, 3, 3, 4, 3, 3, 1, 1, 2, 2, 2...
#> $ ahi07        <int> 3, 4, 4, 4, 4, 4, 3, 3, 4, 4, 3, 3, 2, 3, 1, 1, 2...
#> $ ahi08        <int> 3, 3, 3, 4, 3, 3, 3, 4, 3, 3, 1, 1, 1, 2, 1, 2, 2...
#> $ ahi09        <int> 3, 3, 3, 4, 4, 4, 4, 3, 4, 4, 5, 5, 4, 3, 2, 2, 2...
#> $ ahi10        <int> 2, 2, 3, 3, 4, 4, 4, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2...
#> $ ahi11        <int> 3, 2, 2, 3, 3, 4, 4, 3, 2, 3, 3, 3, 2, 3, 2, 2, 2...
#> $ ahi12        <int> 3, 3, 3, 4, 4, 4, 4, 3, 4, 4, 3, 4, 4, 3, 2, 2, 2...
#> $ ahi13        <int> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 2, 2, 2...
#> $ ahi14        <int> 2, 3, 3, 4, 4, 4, 3, 3, 3, 3, 1, 1, 1, 1, 2, 1, 1...
#> $ ahi15        <int> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 1, 1, 1...
#> $ ahi16        <int> 3, 3, 3, 4, 4, 4, 4, 3, 4, 4, 1, 2, 2, 2, 2, 2, 2...
#> $ ahi17        <int> 2, 2, 3, 4, 4, 4, 3, 4, 3, 4, 3, 3, 3, 3, 1, 1, 2...
#> $ ahi18        <int> 2, 3, 3, 4, 4, 4, 4, 4, 3, 3, 1, 1, 1, 1, 1, 2, 2...
#> $ ahi19        <int> 3, 3, 3, 4, 4, 4, 4, 2, 4, 4, 3, 3, 4, 3, 2, 2, 2...
#> $ ahi20        <int> 3, 3, 3, 4, 3, 4, 4, 4, 3, 4, 4, 5, 3, 4, 2, 2, 2...
#> $ ahi21        <int> 2, 3, 2, 4, 4, 4, 3, 4, 3, 4, 3, 3, 3, 3, 1, 1, 2...
#> $ ahi22        <int> 2, 3, 2, 3, 4, 4, 3, 3, 3, 4, 2, 2, 1, 2, 1, 1, 2...
#> $ ahi23        <int> 3, 4, 4, 4, 4, 4, 3, 3, 3, 3, 1, 2, 1, 2, 2, 1, 2...
#> $ ahi24        <int> 2, 2, 3, 4, 4, 4, 3, 3, 4, 3, 2, 2, 2, 2, 2, 2, 2...
#> $ cesd01       <int> 2, 2, 1, 3, 1, 1, 2, 2, 1, 1, 2, 1, 1, 3, 1, 1, 2...
#> $ cesd02       <int> 1, 1, 1, 2, 1, 1, 3, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3...
#> $ cesd03       <int> 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 4, 3, 4, 4, 1, 1, 2...
#> $ cesd04       <int> 4, 4, 1, 3, 1, 1, 1, 4, 4, 4, 2, 3, 1, 1, 3, 2, 2...
#> $ cesd05       <int> 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 1, 1, 2, 1, 3, 3, 3...
#> $ cesd06       <int> 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 3, 3, 4, 3, 4, 2, 2...
#> $ cesd07       <int> 1, 2, 1, 2, 1, 1, 2, 1, 1, 1, 3, 3, 4, 2, 3, 3, 3...
#> $ cesd08       <int> 3, 4, 1, 1, 1, 4, 4, 2, 4, 4, 1, 3, 2, 2, 1, 1, 2...
#> $ cesd09       <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 3, 3, 2...
#> $ cesd10       <int> 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 4, 1, 1, 1...
#> $ cesd11       <int> 3, 2, 2, 1, 3, 2, 2, 2, 2, 2, 1, 4, 4, 1, 4, 3, 3...
#> $ cesd12       <int> 2, 4, 4, 3, 4, 1, 3, 4, 4, 4, 2, 2, 1, 1, 2, 1, 1...
#> $ cesd13       <int> 2, 1, 1, 1, 3, 1, 3, 3, 1, 2, 4, 3, 4, 4, 2, 3, 1...
#> $ cesd14       <int> 3, 2, 1, 1, 1, 1, 1, 3, 2, 2, 4, 4, 4, 4, 3, 3, 3...
#> $ cesd15       <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1...
#> $ cesd16       <int> 2, 3, 4, 3, 1, 3, 3, 4, 4, 4, 2, 2, 1, 2, 2, 1, 1...
#> $ cesd17       <int> 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 3, 4, 4, 4, 1, 3, 1...
#> $ cesd18       <int> 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 3, 4, 4, 4, 2, 2, 2...
#> $ cesd19       <int> 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1...
#> $ cesd20       <int> 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 2, 3, 3, 2...
#> $ ahiTotal     <int> 63, 73, 73, 89, 89, 93, 80, 77, 77, 85, 60, 67, 5...
#> $ cesdTotal    <int> 14, 6, 7, 10, 13, 8, 15, 12, 3, 5, 31, 31, 41, 35...

## Number of occasions (rows) per participant:
# AHI_CESD_2

# Count the rows of each id (yields 1 row per participant with the columns id and n):
occasions_pp <- summarise(group_by(AHI_CESD_2, id), n = n())

# Show the first participants as a table:
occasions_pp %>%
  head() %>%
  knitr::kable()
id n
1 2
2 6
3 2
4 4
5 5
6 3

## Number of participants with data on all 6 occasions:
nrow(filter(occasions_pp, n == 6))
#> [1] 72

Slice and dice (from long to wide format)

Note that ahi_cesd consists of several blocks of data (one per occasion, including all participants still participating in this occasion).

Split ahi_cesd into 6 parts

Edit ahi_cesd (using its corrected version AHI_CESD_2): Filter each occasion into a separate tibble and combine these tibbles (into a wider format) afterwards:

Goal: Filter 1 block for each occasion, containing 1 row per participant:

# Create an individual tibble for every measurement occasion (0 to 5)
#  to later combine them (by participant id) using full_join().
# Each tibble keeps only those rows of AHI_CESD_2 that belong to 1 occasion.
# No group_by() is used here: the filter condition is evaluated row by row,
#  so grouping by (id, occasion) would not change the selected rows, but
#  would leave grouped tibbles whose grouping carries over into later joins.

ahi_cesd_0 <- AHI_CESD_2 %>%
  filter(occasion == 0)

ahi_cesd_1 <- AHI_CESD_2 %>%
  filter(occasion == 1)

ahi_cesd_2 <- AHI_CESD_2 %>%
  filter(occasion == 2)

ahi_cesd_3 <- AHI_CESD_2 %>%
  filter(occasion == 3)

ahi_cesd_4 <- AHI_CESD_2 %>%
  filter(occasion == 4)

ahi_cesd_5 <- AHI_CESD_2 %>%
  filter(occasion == 5)

Join all occasions (6 parts)

Now we can join the 6 parts by participant id (in wide format):

# Join parts (by participant id, adding the columns of 1 occasion per step): 
# NOTE(review): `suffix` is only used for non-key columns whose names occur in
#  both inputs. Joins 1, 3, and 5 therefore label 2 occasions each
#  (.0/.1, .2/.3, .4/.5), whereas joins 2 and 4 add their new columns un-suffixed;
#  confirm against the dplyr join documentation before changing any suffix.
block_0_to_1 <- ahi_cesd_0 %>%
  full_join(ahi_cesd_1, by = "id", suffix = c(".0", ".1"))
block_0_to_2 <- block_0_to_1 %>%
  full_join(ahi_cesd_2, by = "id", suffix = c(".1", ".2"))
block_0_to_3 <- block_0_to_2 %>%
  full_join(ahi_cesd_3, by = "id", suffix = c(".2", ".3"))
block_0_to_4 <- block_0_to_3 %>%
  full_join(ahi_cesd_4, by = "id", suffix = c(".3", ".4"))
block_0_to_5 <- block_0_to_4 %>%
  full_join(ahi_cesd_5, by = "id", suffix = c(".4", ".5"))

## Check dimensions of the joined data:
block_0_to_5 %>% dim()  # => 295 x 295
#> [1] 295 295

Join occasions with participant data

Joining the participant data (p_info) with the joined occasion data (block_0_to_5):

# Add the columns of all 6 occasions to the participant data (matching rows by id):
complete_data <- p_info %>%
  full_join(block_0_to_5, by = "id")

## Check:
complete_data
#> # A tibble: 295 x 300
#>       id intervention   sex   age  educ income occasion.0 elapsed.days.0
#>    <int>        <int> <int> <int> <int>  <int>      <int>          <dbl>
#>  1     1            4     2    35     5      3          0              0
#>  2     2            1     1    59     1      1          0              0
#>  3     3            4     1    51     4      3          0              0
#>  4     4            3     1    50     5      2          0              0
#>  5     5            2     2    58     5      2          0              0
#>  6     6            1     1    31     5      1          0              0
#>  7     7            3     1    44     5      2          0              0
#>  8     8            2     1    57     4      2          0              0
#>  9     9            1     1    36     4      3          0              0
#> 10    10            2     1    45     4      3          0              0
#> # ... with 285 more rows, and 292 more variables: intervention.0 <int>,
#> #   ahi01.0 <int>, ahi02.0 <int>, ahi03.0 <int>, ahi04.0 <int>,
#> #   ahi05.0 <int>, ahi06.0 <int>, ahi07.0 <int>, ahi08.0 <int>,
#> #   ahi09.0 <int>, ahi10.0 <int>, ahi11.0 <int>, ahi12.0 <int>,
#> #   ahi13.0 <int>, ahi14.0 <int>, ahi15.0 <int>, ahi16.0 <int>,
#> #   ahi17.0 <int>, ahi18.0 <int>, ahi19.0 <int>, ahi20.0 <int>,
#> #   ahi21.0 <int>, ahi22.0 <int>, ahi23.0 <int>, ahi24.0 <int>,
#> #   cesd01.0 <int>, cesd02.0 <int>, cesd03.0 <int>, cesd04.0 <int>,
#> #   cesd05.0 <int>, cesd06.0 <int>, cesd07.0 <int>, cesd08.0 <int>,
#> #   cesd09.0 <int>, cesd10.0 <int>, cesd11.0 <int>, cesd12.0 <int>,
#> #   cesd13.0 <int>, cesd14.0 <int>, cesd15.0 <int>, cesd16.0 <int>,
#> #   cesd17.0 <int>, cesd18.0 <int>, cesd19.0 <int>, cesd20.0 <int>,
#> #   ahiTotal.0 <int>, cesdTotal.0 <int>, occasion.1 <int>,
#> #   elapsed.days.1 <dbl>, intervention.1 <int>, ahi01.1 <int>,
#> #   ahi02.1 <int>, ahi03.1 <int>, ahi04.1 <int>, ahi05.1 <int>,
#> #   ahi06.1 <int>, ahi07.1 <int>, ahi08.1 <int>, ahi09.1 <int>,
#> #   ahi10.1 <int>, ahi11.1 <int>, ahi12.1 <int>, ahi13.1 <int>,
#> #   ahi14.1 <int>, ahi15.1 <int>, ahi16.1 <int>, ahi17.1 <int>,
#> #   ahi18.1 <int>, ahi19.1 <int>, ahi20.1 <int>, ahi21.1 <int>,
#> #   ahi22.1 <int>, ahi23.1 <int>, ahi24.1 <int>, cesd01.1 <int>,
#> #   cesd02.1 <int>, cesd03.1 <int>, cesd04.1 <int>, cesd05.1 <int>,
#> #   cesd06.1 <int>, cesd07.1 <int>, cesd08.1 <int>, cesd09.1 <int>,
#> #   cesd10.1 <int>, cesd11.1 <int>, cesd12.1 <int>, cesd13.1 <int>,
#> #   cesd14.1 <int>, cesd15.1 <int>, cesd16.1 <int>, cesd17.1 <int>,
#> #   cesd18.1 <int>, cesd19.1 <int>, cesd20.1 <int>, ahiTotal.1 <int>,
#> #   cesdTotal.1 <int>, occasion.2 <int>, elapsed.days.2 <dbl>,
#> #   intervention.2 <int>, ahi01.2 <int>, …
dim(complete_data)  # => 295 x 300   
#> [1] 295 300
# complete_data$id[duplicated(complete_data$id)]  # => 8 and 64 NO LONGER appear twice

Using left_join instead:

### Using left_join instead of full_join 
left_0 <- left_join(p_info, ahi_cesd_0, by = "id")
left_1 <- left_join(left_0, ahi_cesd_1, by = "id", suffix = c(".0", ".1"))
left_2 <- left_join(left_1, ahi_cesd_2, by = "id", suffix = c(".1", ".2"))
left_3 <- left_join(left_2, ahi_cesd_3, by = "id", suffix = c(".2", ".3"))
left_4 <- left_join(left_3, ahi_cesd_4, by = "id", suffix = c(".3", ".4"))

complete_data_2 <- left_join(left_4, ahi_cesd_5, by = "id", suffix = c(".4", ".5"))

## Check:
# complete_data_2
dim(complete_data_2)  # => 295 x 300
#> [1] 295 300
# complete_data_2$id[duplicated(complete_data_2$id)]  # => 8 and 64 NO LONGER appear twice

# all.equal(complete_data, complete_data_2) # TRUE (except for column names)

Proceed with complete_data.

Check for and remove redundancies

Delete redundant intervention.* columns, only retaining the column intervention:

dim(complete_data)  # 295 x 300 
#> [1] 295 300

all.equal(complete_data$intervention, complete_data$intervention.0)
#> [1] TRUE
all.equal(complete_data$intervention, complete_data$intervention.1)  # only is.NA value mismatches (due to drop outs)
#> [1] "'is.NA' value mismatch: 147 in current 0 in target"
all.equal(complete_data$intervention, complete_data$intervention.2)  # only is.NA value mismatches (due to drop outs)
#> [1] "'is.NA' value mismatch: 139 in current 0 in target"
all.equal(complete_data$intervention, complete_data$intervention.3)  # only is.NA value mismatches (due to drop outs)
#> [1] "'is.NA' value mismatch: 157 in current 0 in target"
all.equal(complete_data$intervention, complete_data$intervention.4)  # only is.NA value mismatches (due to drop outs)
#> [1] "'is.NA' value mismatch: 162 in current 0 in target"
all.equal(complete_data$intervention, complete_data$intervention.5)  # only is.NA value mismatches (due to drop outs)
#> [1] "'is.NA' value mismatch: 175 in current 0 in target"

## Drop the six per-occasion copies of the intervention column
## (only the un-suffixed `intervention` column is kept):
complete_data <- complete_data %>%
  select(-c(intervention.0, intervention.1, intervention.2,
            intervention.3, intervention.4, intervention.5))
dim(complete_data)  # 295 x 294 
#> [1] 295 294

Save complete_data as posPsy_data_wide.csv and re-load as data_wide:

## Save data files:
# The target file is passed positionally: newer readr versions renamed the
# `path` argument to `file` (and warn on `path =`), while the second
# positional argument works under both names.
write_csv(complete_data, "/Users/hneth/Desktop/stuff/Dropbox/_code/R/_teachR/ds4psy/data/posPsy_data_wide.csv")

## Restore data files:
# data_wide <- read_csv(file = "/Users/hneth/Desktop/stuff/Dropbox/_code/R/_teachR/ds4psy/data/posPsy_data_wide.csv")
data_wide <- read_csv(file = "http://rpository.com/ds4psy/data/posPsy_data_wide.csv")  # online
data_wide
#> # A tibble: 295 x 294
#>       id intervention   sex   age  educ income occasion.0 elapsed.days.0
#>    <int>        <int> <int> <int> <int>  <int>      <int>          <int>
#>  1     1            4     2    35     5      3          0              0
#>  2     2            1     1    59     1      1          0              0
#>  3     3            4     1    51     4      3          0              0
#>  4     4            3     1    50     5      2          0              0
#>  5     5            2     2    58     5      2          0              0
#>  6     6            1     1    31     5      1          0              0
#>  7     7            3     1    44     5      2          0              0
#>  8     8            2     1    57     4      2          0              0
#>  9     9            1     1    36     4      3          0              0
#> 10    10            2     1    45     4      3          0              0
#> # ... with 285 more rows, and 286 more variables: ahi01.0 <int>,
#> #   ahi02.0 <int>, ahi03.0 <int>, ahi04.0 <int>, ahi05.0 <int>,
#> #   ahi06.0 <int>, ahi07.0 <int>, ahi08.0 <int>, ahi09.0 <int>,
#> #   ahi10.0 <int>, ahi11.0 <int>, ahi12.0 <int>, ahi13.0 <int>,
#> #   ahi14.0 <int>, ahi15.0 <int>, ahi16.0 <int>, ahi17.0 <int>,
#> #   ahi18.0 <int>, ahi19.0 <int>, ahi20.0 <int>, ahi21.0 <int>,
#> #   ahi22.0 <int>, ahi23.0 <int>, ahi24.0 <int>, cesd01.0 <int>,
#> #   cesd02.0 <int>, cesd03.0 <int>, cesd04.0 <int>, cesd05.0 <int>,
#> #   cesd06.0 <int>, cesd07.0 <int>, cesd08.0 <int>, cesd09.0 <int>,
#> #   cesd10.0 <int>, cesd11.0 <int>, cesd12.0 <int>, cesd13.0 <int>,
#> #   cesd14.0 <int>, cesd15.0 <int>, cesd16.0 <int>, cesd17.0 <int>,
#> #   cesd18.0 <int>, cesd19.0 <int>, cesd20.0 <int>, ahiTotal.0 <int>,
#> #   cesdTotal.0 <int>, occasion.1 <int>, elapsed.days.1 <dbl>,
#> #   ahi01.1 <int>, ahi02.1 <int>, ahi03.1 <int>, ahi04.1 <int>,
#> #   ahi05.1 <int>, ahi06.1 <int>, ahi07.1 <int>, ahi08.1 <int>,
#> #   ahi09.1 <int>, ahi10.1 <int>, ahi11.1 <int>, ahi12.1 <int>,
#> #   ahi13.1 <int>, ahi14.1 <int>, ahi15.1 <int>, ahi16.1 <int>,
#> #   ahi17.1 <int>, ahi18.1 <int>, ahi19.1 <int>, ahi20.1 <int>,
#> #   ahi21.1 <int>, ahi22.1 <int>, ahi23.1 <int>, ahi24.1 <int>,
#> #   cesd01.1 <int>, cesd02.1 <int>, cesd03.1 <int>, cesd04.1 <int>,
#> #   cesd05.1 <int>, cesd06.1 <int>, cesd07.1 <int>, cesd08.1 <int>,
#> #   cesd09.1 <int>, cesd10.1 <int>, cesd11.1 <int>, cesd12.1 <int>,
#> #   cesd13.1 <int>, cesd14.1 <int>, cesd15.1 <int>, cesd16.1 <int>,
#> #   cesd17.1 <int>, cesd18.1 <int>, cesd19.1 <int>, cesd20.1 <int>,
#> #   ahiTotal.1 <int>, cesdTotal.1 <int>, occasion.2 <int>,
#> #   elapsed.days.2 <dbl>, ahi01.2 <int>, ahi02.2 <int>, ahi03.2 <int>,
#> #   ahi04.2 <int>, …
all.equal(data_wide, complete_data)  # TRUE (except for numeric type of elapsed.days.0)
#> [1] "Incompatible type for column `elapsed.days.0`: x integer, y numeric"

# data_wide$id[duplicated(data_wide$id)]  # => 8 and 64 NO LONGER appear twice

Proceed with data_wide.

+++ here now +++

Datasets

Start from scratch

# Housekeeping:
rm(list = ls())  # cleans ALL objects in current R environment (without asking for confirmation)!

# Load packages:
library(tidyverse)
library(knitr)
library(rmarkdown)

# Custom color "seeblau" (seeblau.4 of the uni.kn color scheme), as a named hex string:
seeblau <- rgb(red = 0, green = 169, blue = 224, maxColorValue = 255, names = "seeblau")

Getting the data

Some notes on the current data versions and availability of the corresponding files (from file datasets.Rmd).

Files available

The following files were generated from the original data files (and saved in .csv format):

  1. posPsy_participants.csv: Original participant data (295 x 6 variables): http://rpository.com/ds4psy/data/posPsy_participants.csv.

  2. posPsy_AHI_CESD.csv: Original data of dependent measures in long format (992 x 50 variables): http://rpository.com/ds4psy/data/posPsy_AHI_CESD.csv.

  3. posPsy_AHI_CESD_corrected.csv: Corrected version of dependent measures in long format (990 x 50 variables): http://rpository.com/ds4psy/data/posPsy_AHI_CESD_corrected.csv.

  4. posPsy_data_wide.csv: Corrected version of all data joined in wide format (295 x 294 variables): http://rpository.com/ds4psy/data/posPsy_data_wide.csv. Different measurement occasions are suffixed by .0, .1, …, .5.

We can load the participant data into R with the following command (from the package readr, which is part of the tidyverse):

Loading data

Reading in these data files from online sources:

library(readr)

# 1. Participant data: 
p_info <- read_csv(file = "http://rpository.com/ds4psy/data/posPsy_participants.csv")  # online
dim(p_info)  # 295 x 6 
#> [1] 295   6

# 2. Original DVs in long format:
AHI_CESD <- read_csv(file = "http://rpository.com/ds4psy/data/posPsy_AHI_CESD.csv")  # online
dim(AHI_CESD)  # 992 x 50
#> [1] 992  50

# 3. Corrected DVs in long format:
AHI_CESD_2 <- read_csv(file = "http://rpository.com/ds4psy/data/posPsy_AHI_CESD_corrected.csv")  # online
dim(AHI_CESD_2)  # 990 x 50
#> [1] 990  50

# 4. Corrected version of all data in wide format: 
posPsy_wide <- readr::read_csv(file = "http://rpository.com/ds4psy/data/posPsy_data_wide.csv")
dim(posPsy_wide)  # 295 cases x 294 variables
#> [1] 295 294

Data visualisation

Some exercises using the ggplot2 package.

ToDo: Put on separate page (and read in required data files).

Exercise 1: Explore age

# Exercise 1: (use p_info data set)

# (a) Create a histogram showing the overall distribution of age
ggplot(p_info) +
  geom_histogram(mapping = aes(age), binwidth = 2, fill = "forestgreen") +
  theme_bw() +
  labs(title = "Distribution of age values")


# (b) Can you display the distribution also as frequency polygon?
# Frequency polygon of age (bins of 4 years).
# A frequency polygon only draws a line, so it takes `color` but has no
# `fill`; passing `fill` merely triggers an "unknown parameters" warning.
ggplot(data = p_info) +
  geom_freqpoly(mapping = aes(x = age), binwidth = 4, color = "grey") +
  labs(title = "Distribution of age",
       x = "Age", y = "Count") +
  theme_bw()

  
# (c) Create a stacked histogram showing the distribution of age by intervention
# Stacked histogram of age, one fill color per intervention group.
# - `intervention` is converted with factor() so that fill is discrete; an
#   integer column would be treated as continuous and not split the bars.
# - `position` is a layer argument, not an aesthetic, so it goes outside aes().
ggplot(p_info) +
  geom_histogram(mapping = aes(x = age, fill = factor(intervention)),
                 position = "stack", binwidth = 3) +
  theme_bw() +
  labs(title = "Distribution of age values by intervention", fill = "intervention")


# (d) Create 4 histograms showing the distribution of age by intervention
ggplot(p_info) +
  geom_histogram(mapping = aes(age), binwidth = 5, fill = "forestgreen") +
  theme_bw() +
  labs(title = "Distribution of age values by intervention") +
  facet_grid(.~intervention)


# (e) Create a histogram and a frequency polygon showing how age is distributed among the male and female participants
p_info$sex <- as.factor(p_info$sex)

ggplot(data = p_info) +
  geom_histogram(mapping = aes(x=age, fill = sex), binwidth = 5) +
  labs(title = "Distribution of age",
       x="Age", y = "Count") +
  theme_bw()


ggplot(data = p_info) +
  geom_freqpoly(mapping = aes(x=age, color = sex), binwidth = 4) +
  labs(title = "Distribution of age",
       x="Age", y = "Count") +
  theme_bw()

Exercise 2: Explore participants - intervention groups

# Exercise 2: Are the participants equally distributed over the 4 intervention categories?
# (a): in total counts
# (b): in %

# # use p_info dataset
# # load data
# p_info <- read_csv(file = "http://rpository.com/ds4psy/data/posPsy_participants.csv")  
# dim(p_info)  # 295 x 6 

## (2a)
p_info$intervention <- as.factor(p_info$intervention)
ggplot(p_info) +
  geom_bar(mapping = aes(x = intervention),  fill = "darkblue") + 
  labs(title = "Participants per intervention group", x="Intervention", y = "Count") +
  theme_minimal()



# (additional task): Every bar should have a different color. How do you do this?

ggplot(p_info) +
  geom_bar(mapping = aes(x = intervention, fill = intervention)) + 
  labs(title = "Participants per intervention group", x="Intervention", y = "Count") +
  theme_minimal()



## (2b)
# Bar chart of the share of participants per intervention group.
# `group = 1` puts all bars in one group, so `..prop..` is the proportion of
# the whole sample rather than 1 for every bar.
ggplot(data = p_info, mapping = aes(x = intervention)) +
  geom_bar(aes(y = ..prop.., group = 1), fill = "darkblue") +
  labs(
    title = "Distribution of participants over intervention groups",
    x = "Intervention",
    y = "Proportion"
  ) +
  theme_minimal()

Exercise 3: Gender per intervention

Is the distribution of gender similar within the intervention groups?

# Exercise 3: Is the distribution of gender similar within the intervention groups?

ggplot(p_info) +
  geom_bar(mapping = aes(x = intervention, fill = sex), position = "dodge") + 
  labs(title = "Participants per intervention group", x="Intervention", y = "Count") +
  theme_minimal()

Exercise 4: Scores

Explore ahiTotal and cesdTotal scores at different occasions.

# Exercise 4

# load data
data_wide <- read_csv(file = "http://rpository.com/ds4psy/data/posPsy_data_wide.csv") 
dim(data_wide)  # 295 x 294 
#> [1] 295 294

## (4a): Show the relationship of ahiTotal and cesdTotal at Occasion 0 using a scatterplot

ggplot(data = data_wide) +
  geom_point(mapping = aes(x = ahiTotal.0, y = cesdTotal.0)) + 
  #adding title and axis labels
  labs(title = "Relationship between ahiTotal and cesdTotal at Occasion = 0", 
       x ="ahiTotal", y = "cesdTotal") +
  theme_minimal()



## (4b): Add the trend-line to this plot
ggplot(data = data_wide, mapping = aes(x = ahiTotal.0, y = cesdTotal.0)) +
  geom_point() + 
  geom_smooth() +
  labs(title = "Relationship between ahiTotal and cesdTotal at Occasion = 0", 
       x ="ahiTotal", y = "cesdTotal") +
  theme_minimal()


## (4c): Make this plot look nicer, for example by using color or changing the pointshape or size.

ggplot(data = data_wide) +
  geom_point(mapping = aes(x = ahiTotal.0, y = cesdTotal.0),color = "darkblue", shape = 18, size=3, alpha = .4) + 
  labs(title = "Relationship between ahiTotal and cesdTotal at Occasion = 0", 
       x ="ahiTotal", y = "cesdTotal") +
  theme_minimal()


  
## (4d): Add information on intervention group using 2 different options

##prep: intervention as factor not as integer
data_wide$intervention <- as.factor(data_wide$intervention)

## 1 Plot, 4 colors: 
 ggplot(data = data_wide) +
  geom_point(mapping = aes(x = ahiTotal.0, y = cesdTotal.0, color = intervention)) + 
  labs(title = "Relationship between ahiTotal and cesdTotal at Occasion = 0",
       x="ahiTotal", y = "cesdTotal") 

 
## 4 plots: 
ggplot(data = data_wide) +
  geom_point(mapping = aes(x = ahiTotal.0, y = cesdTotal.0)) + 
  labs(title = "Relationship between ahiTotal and cesdTotal at Occasion = 0",
       x="ahiTotal", y = "cesdTotal") +
    theme_minimal() +
    facet_grid(.~ intervention)

  
 
## (4e): Let's create 4 plots (hint: facet_grid) for occasion 5 and compare them to the one for occasion 0. What has changed?
  
# Relationship ahiTotal and cesdTotal at Occasion = 5 
 
ggplot(data = data_wide) +
  geom_point(mapping = aes(x = ahiTotal.5, y = cesdTotal.5)) + 
  labs(title = "Relationship between ahiTotal and cesdTotal at Occasion = 5",
       x="ahiTotal", y = "cesdTotal") +
      theme_minimal() +
    facet_grid(.~ intervention)

    
ggplot(data = data_wide) +
  geom_point(mapping = aes(x = ahiTotal.5, y = cesdTotal.5, color = intervention)) + 
  labs(title = "Relationship between ahiTotal and cesdTotal at Occasion = 5",
       x="ahiTotal", y = "cesdTotal") +
          theme_minimal()

Exercise 5: Violins

# Exercise 5:

## load data:
# data_wide <- read_csv(file = "http://rpository.com/ds4psy/data/posPsy_data_wide.csv") 
# dim(data_wide)  # 295 x 294 

## (5a): Plot the distribution of pre-intervention ahiTotal-scores (Occasion = 0) grouped by income level using a violin plot
data_wide$income <- as.factor(data_wide$income)
ggplot(data = data_wide, mapping = aes(x = income,y = ahiTotal.0)) +
  geom_violin() +
  labs(title = "Comparison of pre-intervention ahiTotal-scores at for income levels", x = "income category", y = "pre-intervention ahiTotal-scores" ) +
  theme_classic()


## (5b): Can you add information about how the participants are distributed within each income group using geom_jitter?

ggplot(data = data_wide, mapping = aes(x = income,y = ahiTotal.0)) +
  geom_violin(color = "darkblue") +
  geom_jitter(color = "lightblue", size = 2) +
  labs(title = "Comparison of pre-intervention ahiTotal-scores at for income levels", x = "income category", y = "pre-intervention ahiTotal-scores" ) +
  theme_classic()


## (5c): Can you do the same plot with geom_point, instead of geom_jitter?
ggplot(data = data_wide, mapping = aes(x = income,y = ahiTotal.0)) +
  geom_violin() +
  geom_point(position = "jitter", color = "lightblue", size = 2) +
  labs(title = "Comparison of pre-intervention ahiTotal-scores at for income levels", x = "income category", y = "pre-intervention ahiTotal-scores" ) +
  theme_classic()


## (5d): Let's see how the pre-intervention cesdTotal-scores are distributed in the income groups
ggplot(data = data_wide, mapping = aes(x = income,y = cesdTotal.0)) +
  geom_violin(color = "darkblue") +
  geom_jitter(color = "lightblue", size = 2) +
  labs(title = "Comparison of pre-intervention cesdTotal-scores at for income levels", x = "income category", y = "pre-intervention cesdTotal-scores" ) +
  theme_classic()

Exercise 6: Scatterplots

Explore data - ahiTotal scores, cesdTotal scores and age

# Exercise 6: Create two scatterplots to investigate whether there is a systematic relationship between ahiTotal scores/cesdTotal scores and age.

## load data:
# data_wide <- read_csv(file = "http://rpository.com/ds4psy/data/posPsy_data_wide.csv") 
# dim(data_wide)  # 295 x 294

## (6a): ahiTotal scores
ggplot(data_wide, mapping = aes(x = age, y = ahiTotal.0)) +
  geom_point(color = "forestgreen", shape = 1, size = 1.5) +
  labs(title = "Age and ahiTotal scores before start of interventions", x = "age", y = "Pre-intervention ahiTotal Score")


## (6b): cesdTotal-scores
ggplot(data_wide, mapping = aes(x = age, y = cesdTotal.0)) +
  geom_point(color = "forestgreen", shape = 1, size = 1.5) +
  labs(title = "Age and cesdTotal scores before start of interventions", x = "age", y = "Pre-intervention cesdTotal Score")

Data transformation

Some exercises using the dplyr package.

ToDo: Put on separate page (and read in required data files).

EDA

What is exploratory data analysis (EDA)?

  1. Generate questions about your data.
  2. Search for answers by visualising, transforming, and modelling your data.
  3. Use what you learn to refine your questions and/or generate new questions.

Exercises on EDA:

Some exercises using the dplyr and ggplot2 packages.

ToDo: Put on separate page (and read in required data files).

Exercise 1

  1. How many participants are in each intervention group? Are the participants distributed equally? (Find the solution both graphically and by using data transformation.)

  2. Show the distribution of age values for both women and men.

  3. Provide descriptive statistics (i.e., mean, sd, min, max, etc.) for the age values for every intervention group.

# Exercise 1: 

# Load participant data: 
p_info <- readr::read_csv(file = "http://rpository.com/ds4psy/data/posPsy_participants.csv")
dim(p_info)  # 295 x 6 
#> [1] 295   6

# (a) How many participants are in each intervention group? Are the participants distributed equally?
#     (Find the solution both graphically and by using data transformation.) 

ggplot(p_info) +
  geom_bar(mapping = aes(x = intervention), position = "dodge", fill = "darkblue") + 
  labs(title = "Participants per intervention group", x = "Intervention", y = "Count") +
  theme_minimal()


p_info %>%
  group_by(intervention) %>% 
  count()
#> # A tibble: 4 x 2
#> # Groups:   intervention [4]
#>   intervention     n
#>          <int> <int>
#> 1            1    72
#> 2            2    76
#> 3            3    74
#> 4            4    73

# (b): Show distribution of age for women and men.

p_info$sex <- as.factor(p_info$sex)

ggplot(p_info, mapping = aes(x=age)) +
  geom_density(aes(color= sex)) +
  labs(title = "Distribution of age for men and women", x = "Age", y= "Density") +
  theme_minimal()


# (c): What is the mean age and sd of age for every intervention group?

p_info %>%
  group_by(intervention) %>% 
  summarize(mean_age = mean(age, na.rm = TRUE), 
            sd_age = sd(age, na.rm = TRUE))
#> # A tibble: 4 x 3
#>   intervention mean_age sd_age
#>          <int>    <dbl>  <dbl>
#> 1            1     44.6   12.1
#> 2            2     45.4   12.5
#> 3            3     43.3   12.2
#> 4            4     41.7   12.8

Exercise 2

  • Explore how the distribution of ahi total scores shifted from occasion 0 to 5. Why? Does that match your expectations?
  • Can you see any unusual patterns? What might explain them?
# Exercise 2:  
## (2a): Explore how the distribution of ahi total scores shifted from occasion 0 to 5. Why? Does that match your expectations?
## Can you see any unusual patterns? What might explain them? 

## First, load data in the best suited format and delete redundant colums.

# load data: 
data_wide <- read_csv(file = "http://rpository.com/ds4psy/data/posPsy_data_wide.csv")  # online
dim(data_wide) 
#> [1] 295 294

# select colums of interest: 
data_wide <- select(data_wide, id:elapsed.days.0, contains("Total"), contains("occasion"), starts_with("elapsed."))

# View(data_wide)

ggplot(data_wide) +
  geom_freqpoly(mapping = aes(ahiTotal.0), color = "darkblue", binwidth = 2) +
  geom_freqpoly(mapping = aes(ahiTotal.5), color = "forestgreen", binwidth = 2) +
  theme_minimal() +
  labs(title = "AHI TOTAL SCORE DISTRIBUTION CHANGE", x = "AHI TOTAL SCORES", y = "COUNT")

Exercise 3

# Exercise 3: Let's create plots of the improvement in ahiTotal and cesdTotal scores over the course of the study (occasion 0 - occasion 5) with information on intervention groups.

## In preparation for this exercise, load the required data and create a tibble containing the information needed (mean scores per intervention group for every occasion, info on intervention group, info on occasion).
 
# load data
AHI_CESD_2 <- read_csv(file = "http://rpository.com/ds4psy/data/posPsy_AHI_CESD_corrected.csv")


## (3a): How did the mean ahiscore change in the course of the study?
a <- AHI_CESD_2 %>% 
  group_by(occasion) %>% 
  summarize(mean_ahi = mean(ahiTotal, na.rm = TRUE),
            mean_cesd = mean(cesdTotal, na.rm = TRUE))

# Mean AHI score per occasion (all participants pooled).
# `size` is a constant, so it is set outside aes(); inside aes() it would be
# mapped through a size scale and add a meaningless "size" legend.
ggplot(a, mapping = aes(x = occasion, y = mean_ahi)) +
  geom_point(size = 1.5, color = "darkblue") +
  geom_line(color = "grey") +
  ggtitle("Ahi-scores in course of study") +
  theme_bw()


## (3b): How did the mean ahiscore change in the course of the study grouped by intervention?
## Create a tibble containing the information needed first. (mean scores per intervention group for every occasion, info on intervention group, info on occasion). 

# new tibble containing required data
b <- AHI_CESD_2 %>% 
  group_by(intervention, occasion) %>% 
  summarize(mean_ahi = mean(ahiTotal, na.rm = TRUE),
            mean_cesd = mean(cesdTotal, na.rm = TRUE))
b
#> # A tibble: 24 x 4
#> # Groups:   intervention [?]
#>    intervention occasion mean_ahi mean_cesd
#>           <int>    <int>    <dbl>     <dbl>
#>  1            1        0     68.4      15.1
#>  2            1        1     69.5      15.3
#>  3            1        2     70.3      13.6
#>  4            1        3     75.0      12  
#>  5            1        4     76.8      11.2
#>  6            1        5     75.5      13.5
#>  7            2        0     68.8      16.2
#>  8            2        1     71.6      14.6
#>  9            2        2     73.6      11.4
#> 10            2        3     72.5      12.5
#> # ... with 14 more rows

# First attempt (intentionally flawed, see the question below):
# `intervention` is still an integer column of `b` at this point, so the
# color aesthetic treats it as a continuous variable (a color gradient
# instead of four distinct group colors).
# NOTE(review): `size = 1.5` sits inside aes(), so it is mapped rather than
# set; this presumably adds a "size" legend to the plot.
ggplot(b, mapping = aes(x=occasion, y= mean_ahi, color = intervention)) +
  geom_point(mapping = aes(size = 1.5)) +
  geom_line() +
  ggtitle("Mean ahi-score per intervention group over time")


# Why is the coloring not the way we want it to be?
b$intervention <- as.factor(b$intervention)

# Try again
# Mean AHI score per occasion, one colored line per intervention group
# (`intervention` must be a factor here so that color is discrete).
# The constant point size is set outside aes() to avoid a spurious
# "size" legend.
ggplot(b, mapping = aes(x = occasion, y = mean_ahi, color = intervention)) +
  geom_point(size = 1.5) +
  geom_line() +
  ggtitle("Mean ahi-score per intervention group over time") +
  theme_minimal()


## (3c): Can you combine both plots into one?

# Combined plot: per-intervention means (`b`, colored) plus the overall
# mean across all participants (`a`, black squares and line).
# Constant sizes are set outside aes(); mapping them inside aes() rescales
# them through a size scale and adds a meaningless "size" legend.
# Each layer names its `data` argument explicitly because the first
# positional argument of a geom is `mapping`, not `data`.
ggplot() +
  geom_point(data = b, mapping = aes(x = occasion, y = mean_ahi, color = intervention), size = 1.1) +
  geom_line(data = b, mapping = aes(x = occasion, y = mean_ahi, color = intervention)) +
  geom_point(data = a, mapping = aes(x = occasion, y = mean_ahi), size = 1, shape = 15) +
  geom_line(data = a, mapping = aes(x = occasion, y = mean_ahi), size = 1) +
  ggtitle("Mean ahi-scores over time") +
  theme_light()


## (3d): What was the intervention for group 3? Is the time course reasonable?
# Group 3 had to write a thank-you letter and deliver it in person. This can explain the improvement from occasion 0 to 1. Afterwards they did not have any further tasks. The happiness attained through the letter and its delivery appears to decrease again. It is very interesting that there is another increase from occasion 4 to 5.

## (3e): Create the same plot as in exercise 3c for the cesd scores.
# Combined plot for CES-D: per-intervention means (`b`, colored) plus the
# overall mean across all participants (`a`, black squares and line).
# Constant sizes are set outside aes() so they are applied literally and do
# not create a "size" legend; `data` is named explicitly in every layer.
ggplot() +
  geom_point(data = b, mapping = aes(x = occasion, y = mean_cesd, color = intervention), size = 1) +
  geom_line(data = b, mapping = aes(x = occasion, y = mean_cesd, color = intervention)) +
  geom_point(data = a, mapping = aes(x = occasion, y = mean_cesd), size = 0.7, shape = 15) +
  geom_line(data = a, mapping = aes(x = occasion, y = mean_cesd), size = 0.7) +
  ggtitle("Mean cesd-scores over time") +
  theme_light()



## What does the plot imply about the interventions?
## In three groups, the depression scores first decrease but then increase again towards the end. It could be that the interventions succeed at first, but then lose their power over time.
# Same CES-D plot as in (3e), shown again next to its interpretation:
# per-intervention means (`b`, colored) plus the overall mean (`a`, black).
# Constant sizes are set outside aes() so they are applied literally and do
# not create a "size" legend; `data` is named explicitly in every layer.
ggplot() +
  geom_point(data = b, mapping = aes(x = occasion, y = mean_cesd, color = intervention), size = 1) +
  geom_line(data = b, mapping = aes(x = occasion, y = mean_cesd, color = intervention)) +
  geom_point(data = a, mapping = aes(x = occasion, y = mean_cesd), size = 0.7, shape = 15) +
  geom_line(data = a, mapping = aes(x = occasion, y = mean_cesd), size = 0.7) +
  ggtitle("Mean cesd-scores over time") +
  theme_light()

Exercise 4

# Exercise 4:  
## (4a): Create new variables indicating the improvement in ahi scores from occasion 0 to occasion 1, from occasion 1 to occasion 2, and so on. 
## First, load data in the best suited format and delete redundant colums.

#load data
data_wide <- read_csv(file = "http://rpository.com/ds4psy/data/posPsy_data_wide.csv")  # online
dim(data_wide) 
#> [1] 295 294

#select colums of interest
data_wide <- select(data_wide, id:elapsed.days.0, contains("Total"), contains("occasion"), starts_with("elapsed."))

#new variables

data_wide <- data_wide %>% 
  mutate(improve01ahi = ahiTotal.1 - ahiTotal.0, 
         improve12ahi = ahiTotal.2 - ahiTotal.1, 
         improve23ahi = ahiTotal.3 - ahiTotal.2,
         improve34ahi = ahiTotal.4 - ahiTotal.3,
         improve45ahi = ahiTotal.5 - ahiTotal.4)
dim(data_wide)#295 35
#> [1] 295  35


##(4b): Compute the mean improvements per intervention group
data_wide %>% 
  group_by(intervention) %>% 
  summarise(mean_improve01_i = mean(improve01ahi, na.rm = TRUE), 
            mean_improve12_i = mean(improve12ahi, na.rm = TRUE),
            mean_improve23_i = mean(improve23ahi, na.rm = TRUE),
            mean_improve34_i = mean(improve34ahi, na.rm = TRUE),
            mean_improve45_i = mean(improve45ahi, na.rm = TRUE))
#> # A tibble: 4 x 6
#>   intervention mean_improve01_i mean_improve12_i mean_improve23_i
#>          <int>            <dbl>            <dbl>            <dbl>
#> 1            1             4             -1.4               2.04 
#> 2            2             2.65           0.538            -0.025
#> 3            3             3.64           0.529             1.58 
#> 4            4             2.84           0.0667            0.364
#> # ... with 2 more variables: mean_improve34_i <dbl>,
#> #   mean_improve45_i <dbl>

##(4c): What is surprising about the results? 
#There are also negative improvements.


##(4d): Do women and men differ in their improvements?
#Try to find the solution graphically first. Then have a look on the numbers.
data_wide %>%
  group_by(intervention, sex) %>% 
  summarise(mean_improve01_i = mean(improve01ahi, na.rm = TRUE), 
            mean_improve12_i = mean(improve12ahi, na.rm = TRUE),
            mean_improve23_i = mean(improve23ahi, na.rm = TRUE),
            mean_improve34_i = mean(improve34ahi, na.rm = TRUE),
            mean_improve45_i = mean(improve45ahi, na.rm = TRUE))
#> # A tibble: 8 x 7
#> # Groups:   intervention [?]
#>   intervention   sex mean_improve01_i mean_improve12_i mean_improve23_i
#>          <int> <int>            <dbl>            <dbl>            <dbl>
#> 1            1     1             4.64           -1.17             2.07 
#> 2            1     2            -5              -4                1    
#> 3            2     1             2.57            0.485            0.618
#> 4            2     2             3.17            0.833           -3.67 
#> 5            3     1             3.74            0.5              1.12 
#> 6            3     2             2.5             1                2.71 
#> 7            4     1             3.66            0.522            0.769
#> 8            4     2             0              -1.43            -1.14 
#> # ... with 2 more variables: mean_improve34_i <dbl>,
#> #   mean_improve45_i <dbl>

Exercise 5

# Exercise 5:  
# (5a): Is there a difference in total improve regarding gender?

d <- data_wide %>%
  mutate(change_ahi = ahiTotal.5 - ahiTotal.0,
         change_cesd = cesdTotal.5 - cesdTotal.0)

d %>%
  group_by(sex) %>% 
  summarize(mean_change_ahi = mean(change_ahi, na.rm = TRUE), 
            mean_change_cesd = mean(change_cesd, na.rm = TRUE))
#> # A tibble: 2 x 3
#>     sex mean_change_ahi mean_change_cesd
#>   <int>           <dbl>            <dbl>
#> 1     1            5.56            -2.04
#> 2     2            3.47            -3.05


# (5b): Are there interventions which are more effective for men or women?
# Use data transformation commands to find the solution first, then create two plots (one for ahi and one for cesd scores)
e <- d%>%
  group_by(sex, intervention) %>% 
  summarize(mean_change_ahi = mean(change_ahi, na.rm = TRUE), 
            mean_change_cesd = mean(change_cesd, na.rm = TRUE))


# Change in ahi Scores
e$sex <- as.factor(e$sex)
ggplot(e) +
  geom_point(aes(x = intervention, y =mean_change_ahi, color = sex)) +
  labs(title = "Mean change in ahi scores for men and women", x = "intervention", y = "Change in ahi scores (mean)") +
  theme_light() +
  geom_hline(yintercept = 0)


# Change in cesd Scores
e$sex <- as.factor(e$sex)
ggplot(e) +
  geom_point(aes(x = intervention, y =mean_change_cesd, color = sex)) +
  labs(title = "Mean change in cesd scores for men and women", x = "intervention", y = "Change in cesd scores (mean)") +
  theme_light() +
  geom_hline(yintercept = 0)

Exercise 6

# Exercise 6: Dropout  
## (6a): How many participants dropped out at occasion 1, occasion 2, ..., occasion 5? (Dropout means "NA" at all remaining occasions, not only at one)

# load data
data_wide <- read_csv(file = "http://rpository.com/ds4psy/data/posPsy_data_wide.csv") 
dim(data_wide) #295 294
#> [1] 295 294


# Dropout at occasion 1

data_wide %>% 
  filter(is.na(occasion.1) & is.na(occasion.2) & is.na(occasion.3) & is.na(occasion.4) & is.na(occasion.5)) %>% 
  count() #93
#> # A tibble: 1 x 1
#>       n
#>   <int>
#> 1    93

#Dropout at occasion 2

data_wide %>% 
  filter(!is.na(occasion.1) & is.na(occasion.2) & is.na(occasion.3) & is.na(occasion.4) & is.na(occasion.5)) %>% 
  count() #23
#> # A tibble: 1 x 1
#>       n
#>   <int>
#> 1    23

#Dropout at occasion 3

data_wide %>% 
  filter(!is.na(occasion.1) & !is.na(occasion.2) & is.na(occasion.3) & is.na(occasion.4) & is.na(occasion.5)) %>% 
  count() #8
#> # A tibble: 1 x 1
#>       n
#>   <int>
#> 1     8


#Dropout at occasion 4

data_wide %>% 
  filter(!is.na(occasion.1) & !is.na(occasion.2) & !is.na(occasion.3) & is.na(occasion.4) & is.na(occasion.5)) %>% 
  count() #4
#> # A tibble: 1 x 1
#>       n
#>   <int>
#> 1     4

#Dropout at occasion 5

data_wide %>% 
  filter(!is.na(occasion.1) & !is.na(occasion.2) & !is.na(occasion.3) & !is.na(occasion.4) & is.na(occasion.5)) %>% 
  count() #15
#> # A tibble: 1 x 1
#>       n
#>   <int>
#> 1    15


## (6b): How many participants dropped out in total?
# Total dropout = sum of the five per-occasion dropout counts from (6a):
# 93 (occasion 1), 23 (occasion 2), 8 (occasion 3), 4 (occasion 4) and
# 15 (occasion 5). The last summand is 15, not 5.
93 + 23 + 8 + 4 + 15 #143
#> [1] 143

Exercise 7

#Exercise 7: Participants with stable scores 
##You computed new variables with information on the changes in ahi and cesd total scores from occasion 0 to occasion 5 in exercise 5. Compute the variables again or take the data from exercise 5

# Re-import the wide data and check its dimensions
data_wide <- read_csv("http://rpository.com/ds4psy/data/posPsy_data_wide.csv")
dim(data_wide) # 295 294
#> [1] 295 294

# Keep the leading columns up to elapsed.days.0, plus all total-score,
# occasion and elapsed-days columns
data_wide <- data_wide %>%
  select(
    id:elapsed.days.0,
    contains("Total"),
    contains("occasion"),
    starts_with("elapsed.")
  )

# Change scores: total at occasion 5 minus total at occasion 0
e <- mutate(
  data_wide,
  change_ahi = ahiTotal.5 - ahiTotal.0,
  change_cesd = cesdTotal.5 - cesdTotal.0
)

##(7a): Are there participants whose ahiTotal scores did not change? If yes: how many are there?
# Show the rows whose ahi change score is exactly zero
filter(e, change_ahi == 0) # yes
#> # A tibble: 5 x 32
#>      id intervention   sex   age  educ income occasion.0 elapsed.days.0
#>   <int>        <int> <int> <int> <int>  <int>      <int>          <int>
#> 1    66            2     1    51     5      3          0              0
#> 2    83            4     1    71     2      1          0              0
#> 3    92            3     1    60     5      3          0              0
#> 4   113            1     2    42     4      1          0              0
#> 5   173            3     2    48     5      3          0              0
#> # ... with 24 more variables: ahiTotal.0 <int>, cesdTotal.0 <int>,
#> #   ahiTotal.1 <int>, cesdTotal.1 <int>, ahiTotal.2 <int>,
#> #   cesdTotal.2 <int>, ahiTotal.3 <int>, cesdTotal.3 <int>,
#> #   ahiTotal.4 <int>, cesdTotal.4 <int>, ahiTotal.5 <int>,
#> #   cesdTotal.5 <int>, occasion.1 <int>, occasion.2 <int>,
#> #   occasion.3 <int>, occasion.4 <int>, occasion.5 <int>,
#> #   elapsed.days.1 <dbl>, elapsed.days.2 <dbl>, elapsed.days.3 <dbl>,
#> #   elapsed.days.4 <dbl>, elapsed.days.5 <dbl>, change_ahi <int>,
#> #   change_cesd <int>

# Number of participants with an unchanged ahiTotal score
filter(e, change_ahi == 0) %>%
  count() # 5
#> # A tibble: 1 x 1
#>       n
#>   <int>
#> 1     5

##(7b): Are there participants whose cesdTotal scores did not change? If yes: how many are there?
# Show the rows whose cesd change score is exactly zero
filter(e, change_cesd == 0) # yes
#> # A tibble: 8 x 32
#>      id intervention   sex   age  educ income occasion.0 elapsed.days.0
#>   <int>        <int> <int> <int> <int>  <int>      <int>          <int>
#> 1    34            1     1    43     5      3          0              0
#> 2    43            3     1    24     4      3          0              0
#> 3   113            1     2    42     4      1          0              0
#> 4   135            4     1    59     5      3          0              0
#> 5   156            4     1    26     3      2          0              0
#> 6   182            3     1    55     5      3          0              0
#> 7   187            1     1    52     5      2          0              0
#> 8   245            4     1    29     2      1          0              0
#> # ... with 24 more variables: ahiTotal.0 <int>, cesdTotal.0 <int>,
#> #   ahiTotal.1 <int>, cesdTotal.1 <int>, ahiTotal.2 <int>,
#> #   cesdTotal.2 <int>, ahiTotal.3 <int>, cesdTotal.3 <int>,
#> #   ahiTotal.4 <int>, cesdTotal.4 <int>, ahiTotal.5 <int>,
#> #   cesdTotal.5 <int>, occasion.1 <int>, occasion.2 <int>,
#> #   occasion.3 <int>, occasion.4 <int>, occasion.5 <int>,
#> #   elapsed.days.1 <dbl>, elapsed.days.2 <dbl>, elapsed.days.3 <dbl>,
#> #   elapsed.days.4 <dbl>, elapsed.days.5 <dbl>, change_ahi <int>,
#> #   change_cesd <int>

# Number of participants with an unchanged cesdTotal score
filter(e, change_cesd == 0) %>%
  count() # 8
#> # A tibble: 1 x 1
#>       n
#>   <int>
#> 1     8


##(7c): Are there participants whose ahiTotal score and cesdTotal score did not change? If yes: how many are there?
# Both change scores must be zero at the same time
filter(e, change_cesd == 0 & change_ahi == 0) # yes
#> # A tibble: 1 x 32
#>      id intervention   sex   age  educ income occasion.0 elapsed.days.0
#>   <int>        <int> <int> <int> <int>  <int>      <int>          <int>
#> 1   113            1     2    42     4      1          0              0
#> # ... with 24 more variables: ahiTotal.0 <int>, cesdTotal.0 <int>,
#> #   ahiTotal.1 <int>, cesdTotal.1 <int>, ahiTotal.2 <int>,
#> #   cesdTotal.2 <int>, ahiTotal.3 <int>, cesdTotal.3 <int>,
#> #   ahiTotal.4 <int>, cesdTotal.4 <int>, ahiTotal.5 <int>,
#> #   cesdTotal.5 <int>, occasion.1 <int>, occasion.2 <int>,
#> #   occasion.3 <int>, occasion.4 <int>, occasion.5 <int>,
#> #   elapsed.days.1 <dbl>, elapsed.days.2 <dbl>, elapsed.days.3 <dbl>,
#> #   elapsed.days.4 <dbl>, elapsed.days.5 <dbl>, change_ahi <int>,
#> #   change_cesd <int>

# Number of participants with both scores unchanged
filter(e, change_cesd == 0 & change_ahi == 0) %>%
  count() # 1
#> # A tibble: 1 x 1
#>       n
#>   <int>
#> 1     1


##(7d): Who is it?
# Return only the id column of the participant(s) with both scores unchanged
select(
  filter(e, change_cesd == 0 & change_ahi == 0),
  id
)
#> # A tibble: 1 x 1
#>      id
#>   <int>
#> 1   113

#participant with id 113


##(7e): For this participant, plot the scores over time.

# Import the corrected AHI/CES-D data and check its dimensions
AHI_CESD_2 <- read_csv("http://rpository.com/ds4psy/data/posPsy_AHI_CESD_corrected.csv")
dim(AHI_CESD_2) # 990 50
#> [1] 990  50


# Keep only the rows of participant 113 (the object is overwritten in place,
# so it holds only this participant from here on).
# id is an integer column, so compare against a number rather than a string
# to avoid relying on implicit type coercion.
AHI_CESD_2 <- AHI_CESD_2 %>%
  filter(id == 113)
AHI_CESD_2
#> # A tibble: 5 x 50
#>      id occasion elapsed.days intervention ahi01 ahi02 ahi03 ahi04 ahi05
#>   <int>    <int>        <dbl>        <int> <int> <int> <int> <int> <int>
#> 1   113        0          0              1     1     1     4     2     2
#> 2   113        1         11.5            1     1     1     2     3     2
#> 3   113        2         17.5            1     2     1     1     1     1
#> 4   113        4         93.5            1     2     1     1     2     2
#> 5   113        5        182.             1     1     1     4     2     1
#> # ... with 41 more variables: ahi06 <int>, ahi07 <int>, ahi08 <int>,
#> #   ahi09 <int>, ahi10 <int>, ahi11 <int>, ahi12 <int>, ahi13 <int>,
#> #   ahi14 <int>, ahi15 <int>, ahi16 <int>, ahi17 <int>, ahi18 <int>,
#> #   ahi19 <int>, ahi20 <int>, ahi21 <int>, ahi22 <int>, ahi23 <int>,
#> #   ahi24 <int>, cesd01 <int>, cesd02 <int>, cesd03 <int>, cesd04 <int>,
#> #   cesd05 <int>, cesd06 <int>, cesd07 <int>, cesd08 <int>, cesd09 <int>,
#> #   cesd10 <int>, cesd11 <int>, cesd12 <int>, cesd13 <int>, cesd14 <int>,
#> #   cesd15 <int>, cesd16 <int>, cesd17 <int>, cesd18 <int>, cesd19 <int>,
#> #   cesd20 <int>, ahiTotal <int>, cesdTotal <int>

# Plot the total scores of participant 113 across measurement occasions:
# ahiTotal in dark blue, cesdTotal in green. Points mark the observed
# occasions; lines connect consecutive observations.
# The x mapping is shared by all layers, so it is set once in ggplot().
ggplot(AHI_CESD_2, aes(x = occasion)) +
  geom_point(aes(y = ahiTotal), color = "darkblue", size = 2) +
  geom_line(aes(y = ahiTotal), color = "darkblue") +
  geom_point(aes(y = cesdTotal), color = "forestgreen", size = 2) +
  geom_line(aes(y = cesdTotal), color = "forestgreen") +
  labs(
    title = "Participant 113 ahiTotal scores (blue) and cesdTotal scores (green) over time",
    x = "occasion",
    y = "Total scores"
  ) +
  theme_light()

References

  • Seligman, M. E., Steen, T. A., Park, N., & Peterson, C. (2005). Positive psychology progress: Empirical validation of interventions. American Psychologist, 60(5), 410–421.

  • Woodworth, R. J., O’Brien‐Malone, A., Diamond, M. R., & Schüz, B. (2017). Web‐based positive psychology interventions: A reexamination of effectiveness. Journal of Clinical Psychology, 73(3), 218–232.

  • Woodworth, R. J., O’Brien-Malone, A., Diamond, M. R. and Schüz, B. (2018). Data from, ‘Web-based positive psychology interventions: A reexamination of effectiveness’. Journal of Open Psychology Data, 6: 1. DOI: https://doi.org/10.5334/jopd.35

  • Data at https://doi.org/10.6084/m9.figshare.1577563.v1.

[Last update on 2018-11-26 11:27:13 by hn.]